diff --git a/.travis.yml b/.travis.yml index bde0e202d..2a221e3bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -224,12 +224,21 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - + - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" + + - <<: *test-macos + osx_image: xcode12 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + # - <<: *test-macos # osx_image: xcode10 # env: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3107ef9a9..d0313c842 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 14) +set(OpenBLAS_PATCH_VERSION 14.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions diff --git a/Changelog.txt b/Changelog.txt index 5662bc5c6..6c5cf573e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,54 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.15 + 2-May-2021 + +common: + - imported improvements and bugfixes from Reference-LAPACK 3.9.1 + - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537 + - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation + - fixed a sequence problem in the generation of softlinks to the library in GMAKE + +RISC V: + - fixed compilation on RISCV (missing entry in getarch) + - fixed a potential division by zero in CROTG and ZROTG + +POWER: + - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler + - improved CGEMM, DGEMM and ZGEMM performance on POWER10 + - added an optimized ZGEMV kernel for POWER10 + - fixed a potential division by zero in CROTG and ZROTG + +x86_64: + - added support for Intel Control-flow Enforcement Technology (CET) + - reverted the DOMATCOPY_RT code to the generic C version + - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14 + - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH + - added support for compilation of the benchmarks on older OSX versions + - fix propagation of the NO_AVX512 option in CMAKE builds + - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows + - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX) + - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512 + +ARM: + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +ARM64: + - fixed spurious reads outside the array in the SGEMM tcopy macro + - fixed a potential division by zero in CROTG and ZROTG + - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14) + +MIPS + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +MIPS64: + - fixed a potential division by zero in CROTG and ZROTG + +SPARC: + - fixed a potential division by zero in CROTG and ZROTG + ==================================================================== Version 0.3.14 17-Mar-2021 
diff --git a/Makefile b/Makefile index de0735c4a..555d1c467 100644 --- a/Makefile +++ b/Makefile @@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -196,6 +195,7 @@ endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack diff --git a/Makefile.rule b/Makefile.rule index 5a46bf6b0..38d0161a3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.14 +VERSION = 0.3.14.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.x86 b/Makefile.x86 index 0e27264d8..893379c33 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,10 +1,21 @@ # COMPILER_PREFIX = mingw32- -ifdef HAVE_SSE -CCOMMON_OPT += -msse -FCOMMON_OPT += -msse +ifndef DYNAMIC_ARCH +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif endif +ifdef ADD_CPUFLAGS +ifdef HAVE_SSE +CCOMMON_OPT += -msse +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -msse +endif +endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 5406494c9..f62ab9e5e 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,16 @@ endif endif endif + +ifndef DYNAMIC_ARCH +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 ifneq ($(F_COMPILER), NAG) @@ -44,7 +54,6 @@ endif endif ifeq ($(CORE), SKYLAKEX) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 ifneq ($(F_COMPILER), NAG) @@ -62,10 +71,8 @@ endif endif endif endif -endif ifeq ($(CORE), COOPERLAKE) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 @@ -88,7 +95,6 @@ endif endif endif endif -endif ifdef HAVE_AVX2 ifndef NO_AVX2 @@ -120,6 +126,7 @@ endif endif endif +endif ifeq ($(OSNAME), Interix) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..56a3fd4ae 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,7 +4,15 @@ trigger: branches: include: - develop - +resources: + containers: + - container: oneapi-hpckit + image: intel/oneapi-hpckit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + - container: oneapi-basekit + image: intel/oneapi-basekit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + jobs: # manylinux1 is useful to test because the # standard Docker container uses an old version @@ -68,4 +76,64 @@ jobs: dir openblas_utest.exe - +- job: OSX_OpenMP + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + +- job: OSX_GCC_Nothreads + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + +- job: OSX_OpenMP_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + 
brew update + brew install llvm libomp + make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 + +- job: OSX_Ifort_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg + LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler + steps: + - script: | + brew update + brew install llvm libomp + sudo mkdir -p /opt/intel + sudo chown $USER /opt/intel + displayName: prepare for cache restore + - task: Cache@2 + inputs: + path: /opt/intel/oneapi + key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"' + cacheHitVar: CACHE_RESTORED + - script: | + curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5 + hdiutil attach webimage.dmg + sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=. + installer_exit_code=$? + hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet + exit $installer_exit_code + displayName: install + condition: ne(variables.CACHE_RESTORED, 'true') + - script: | + source /opt/intel/oneapi/setvars.sh + make CC=/usr/local/opt/llvm/bin/clang FC=ifort + diff --git a/benchmark/bench.h b/benchmark/bench.h index 83de8ab2b..c03d72bef 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -3,6 +3,8 @@ #include #ifdef __CYGWIN32__ #include +#elif defined(__APPLE__) +#include #endif #include "common.h" diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 73f2592ef..0e45d4c63 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -66,7 +66,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -112,14 +112,14 @@ set(SLASRC sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -171,7 +171,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfg.f clarfgp.f clarft.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -209,14 +209,14 @@ 
set(CLASRC cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -253,7 +253,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -300,14 +300,14 @@ set(DLASRC dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -360,7 +360,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -402,13 +402,13 @@ set(ZLASRC ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f ztplqt.f ztplqt2.f ztpmlqt.f zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 54a583887..340ea6d6c 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -114,6 +114,8 @@ set(CSRC lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c + lapacke_cgetsqrhrt.c + lapacke_cgetsqrhrt_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c @@ -590,6 +592,8 @@ set(CSRC lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c + lapacke_cungtsqr_row.c + 
lapacke_cungtsqr_row_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -735,6 +739,8 @@ set(DSRC lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c + lapacke_dgetsqrhrt.c + lapacke_dgetsqrhrt_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c @@ -862,6 +868,8 @@ set(DSRC lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c + lapacke_dorgtsqr_row.c + lapacke_dorgtsqr_row_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1309,6 +1317,8 @@ set(SSRC lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c + lapacke_sgetsqrhrt.c + lapacke_sgetsqrhrt_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c @@ -1435,6 +1445,8 @@ set(SSRC lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c + lapacke_sorgtsqr_row.c + lapacke_sorgtsqr_row_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1877,6 +1889,8 @@ set(ZSRC lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c + lapacke_zgetsqrhrt.c + lapacke_zgetsqrhrt_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c @@ -2351,6 +2365,8 @@ set(ZSRC lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c + lapacke_zungtsqr_row.c + lapacke_zungtsqr_row_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c diff --git a/cmake/system.cmake b/cmake/system.cmake index eee429113..d6c71b774 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -299,6 +299,10 @@ if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () +if (NO_AVX512) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif () + if (USE_THREAD) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 diff --git a/common.h b/common.h index 862e0b4db..ac795937c 100644 --- a/common.h +++ b/common.h @@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + #ifdef ARCH_X86 #include "common_x86.h" #endif diff --git a/common_x86.h b/common_x86.h index ec928e236..bc77eca58 100644 --- a/common_x86.h +++ b/common_x86.h @@ -340,7 +340,8 @@ REALNAME: .align 16; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call mcount diff --git a/common_x86_64.h b/common_x86_64.h index b813336c6..729a055ce 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,7 +451,8 @@ REALNAME: .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* 
)malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + 
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( 
(*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c 
@@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/interface/zrotg.c b/interface/zrotg.c index 8caa411fc..bc4f06492 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ aa_i = fabs(da_r); } - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); + if (aa_r == ZERO) { + ada = 0.; + } else { + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + } bb_r = fabs(db_r); bb_i = fabs(db_i); @@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ bb_i = fabs(bb_r); } - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - + if (bb_r == ZERO) { + adb = 0.; + } else { + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + } scale = ada + adb; aa_r = da_r / scale; diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 12b80bdca..46198b3a2 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -270,11 +270,6 @@ All rights reserved. ldr s1, [A02] ldr s2, [A03] ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 stp s0, s1, [B04] add B04, B04, #8 @@ -285,11 +280,6 @@ All rights reserved. ldr s5, [A06] ldr s6, [A07] ldr s7, [A08] - - ldr d4, [A05], #8 - ldr d5, [A06], #8 - ldr d6, [A07], #8 - ldr d7, [A08], #8 stp s4, s5, [B04] add B04, B04, #8 diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 1cf7b0b7c..873653f1e 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -169,8 +169,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c @@ -181,7 +186,7 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t_power10.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2f4cd204..2b8e65948 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 2bd2516de..b6b102b3e 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index c53fe0c02..b9e2d2ce5 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #include "common.h" -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 4d9b9ccd6..c2fde1c44 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cswap_microk_power10.c" +#elif defined(POWER10) +#include "cswap_microk_power8.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 0cdec3292..7507621cf 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dasum_microk_power10.c" +#elif defined(POWER10) +#include "dasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index e918e61c3..cdd846891 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + 
rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); 
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 94d9d95a3..3229878e4 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "drot_microk_power10.c" +#elif defined(POWER10) +#include "drot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 96c4e51bc..32c39a8f4 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dscal_microk_power10.c" +#elif defined(POWER10) +#include "dscal_microk_power8.c" #endif #endif @@ -102,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -136,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 9e6229c6a..12476965b 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "dswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index af692a7fa..991d27508 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sasum_microk_power10.c" +#elif defined(POWER10) +#include "sasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 3e4f93e2a..5a0d4b12e 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "srot_microk_power10.c" +#elif defined(POWER10) +#include "srot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 65572a8c1..9ae9ccab8 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sscal_microk_power10.c" +#elif defined(POWER10) +#include "sscal_microk_power8.c" #endif #endif @@ -104,7 +106,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -138,7 +140,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index dd249fd36..955ed02f0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "sswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..f5bb8d70e --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1102 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + 
__builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; 
+ register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * 
+        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
+        y[i] += a2[i] * x[4] - a2[i + 1] * x[5];
+        y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4];
+        y[i] += a3[i] * x[6] - a3[i + 1] * x[7];
+        y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
+        y[i] += a2[i] * x[4] + a2[i + 1] * x[5];
+        y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4];
+        y[i] += a3[i] * x[6] + a3[i + 1] * x[7];
+        y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6];
+#endif
+    }
+}
+
+#endif
+
+#ifdef HAVE_KERNEL_4x2_VEC
+
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+    register __vector double vx0_r = {x[0], x[0]};
+    register __vector double vx0_i = {-x[1], x[1]};
+    register __vector double vx1_r = {x[2], x[2]};
+    register __vector double vx1_i = {-x[3], x[3]};
+
+#else
+    register __vector double vx0_r = {x[0], -x[0]};
+    register __vector double vx0_i = {x[1], x[1]};
+    register __vector double vx1_r = {x[2], -x[2]};
+    register __vector double vx1_i = {x[3], x[3]};
+#endif
+
+    register __vector double *vy = (__vector double *) y;
+    register __vector double *vptr_a0 = (__vector double *) a0;
+    register __vector double *vptr_a1 = (__vector double *) a1;
+
+    for (i = 0; i < n; i += 4) {
+
+        register __vector double vy_0 = vy[i];
+        register __vector double vy_1 = vy[i + 1];
+        register __vector double vy_2 = vy[i + 2];
+        register __vector double vy_3 = vy[i + 3];
+
+        register __vector double va0 = vptr_a0[i];
+        register __vector double va0_1 = vptr_a0[i + 1];
+        register __vector double va0_2 = vptr_a0[i + 2];
+        register __vector double va0_3 = vptr_a0[i + 3];
+
+        register __vector double va1 = vptr_a1[i];
+        register __vector double va1_1 = vptr_a1[i + 1];
+        register __vector double va1_2 = vptr_a1[i + 2];
+        register __vector double va1_3 = vptr_a1[i + 3];
+
+        vy_0 += va0*vx0_r;
+        vy_1 += va0_1*vx0_r;
+        vy_2 += va0_2*vx0_r;
+        vy_3 += va0_3*vx0_r;
+
+        va0 = vec_xxpermdi(va0, va0, 2);
+        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
+        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
+        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
+
+        vy_0 += va1*vx1_r;
+        vy_1 += va1_1*vx1_r;
+        vy_2 += va1_2*vx1_r;
+        vy_3 += va1_3*vx1_r;
+
+        va1 = vec_xxpermdi(va1, va1, 2);
+        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);
+        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
+        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);
+
+        vy_0 += va0*vx0_i;
+        vy_1 += va0_1*vx0_i;
+        vy_2 += va0_2*vx0_i;
+        vy_3 += va0_3*vx0_i;
+
+        vy_0 += va1*vx1_i;
+        vy_1 += va1_1*vx1_i;
+        vy_2 += va1_2*vx1_i;
+        vy_3 += va1_3*vx1_i;
+
+        vy[i] = vy_0;
+        vy[i + 1] = vy_1;
+        vy[i + 2] = vy_2;
+        vy[i + 3] = vy_3;
+
+    }
+}
+#else
+
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
+#endif
+    }
+}
+
+#endif
+
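[Editorial note: the vectorized kernels above all rely on the same complex-arithmetic trick: the real part of each x element is splatted into vx*_r = {r, r}, the imaginary part goes into vx*_i with a sign flip in one lane, and vec_xxpermdi(va, va, 2) swaps the real/imaginary halves of the loaded column so that two plain vector FMAs produce a full complex multiply-accumulate. A minimal scalar sketch of the identity being exploited, illustrative only and not part of the patch:]

    /* y += a*x for complex doubles stored as {re, im}:
     *   {yr, yi} += {ar, ai} * { xr, xr}     (va        * vx_r)
     *            +  {ai, ar} * {-xi, xi}     (swap(va)  * vx_i)
     *            =  {ar*xr - ai*xi, ai*xr + ar*xi}, the usual complex product.
     */
    static void caxpy_1(const double a[2], const double x[2], double y[2]) {
        y[0] += a[0] * x[0] - a[1] * x[1];
        y[1] += a[1] * x[0] + a[0] * x[1];
    }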
+#ifdef HAVE_KERNEL_4x1_VEC
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+    register __vector double vx0_r = {x[0], x[0]};
+    register __vector double vx0_i = {-x[1], x[1]};
+
+#else
+    register __vector double vx0_r = {x[0], -x[0]};
+    register __vector double vx0_i = {x[1], x[1]};
+#endif
+
+    register __vector double *vy = (__vector double *) y;
+    register __vector double *vptr_a0 = (__vector double *) a0;
+
+    for (i = 0; i < n; i += 4) {
+
+        register __vector double vy_0 = vy[i];
+        register __vector double vy_1 = vy[i + 1];
+        register __vector double vy_2 = vy[i + 2];
+        register __vector double vy_3 = vy[i + 3];
+
+        register __vector double va0 = vptr_a0[i];
+        register __vector double va0_1 = vptr_a0[i + 1];
+        register __vector double va0_2 = vptr_a0[i + 2];
+        register __vector double va0_3 = vptr_a0[i + 3];
+
+        register __vector double va0x = vec_xxpermdi(va0, va0, 2);
+        register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
+        register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
+        register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
+        vy_0 += va0*vx0_r + va0x*vx0_i;
+        vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
+        vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
+        vy_3 += va0_3*vx0_r + va0x_3*vx0_i;
+
+        vy[i] = vy_0;
+        vy[i + 1] = vy_1;
+        vy[i + 2] = vy_2;
+        vy[i + 3] = vy_3;
+
+    }
+}
+
+#else
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+#endif
+    }
+}
+
+#endif
+
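[Editorial note: the add_y helper that follows applies the deferred alpha scaling: the kernels above accumulate A*x into a contiguous ybuffer with an implicit alpha of 1, and add_y then performs y += alpha * ybuffer elementwise in complex arithmetic, for both unit and strided destinations. A scalar sketch of one element, assuming XCONJ is not defined (hypothetical helper, matching the scalar fallback shown after this section):]

    /* dest += alpha * src for one complex element in {re, im} layout */
    static void add_y_1(const double alpha[2], const double src[2], double dest[2]) {
        dest[0] += alpha[0] * src[0] - alpha[1] * src[1];
        dest[1] += alpha[0] * src[1] + alpha[1] * src[0];
    }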
+#ifdef HAVE_KERNEL_ADDY
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+
+#if !defined(XCONJ)
+
+    register __vector double valpha_r = {alpha_r, alpha_r};
+    register __vector double valpha_i = {-alpha_i, alpha_i};
+
+#else
+    register __vector double valpha_r = {alpha_r, -alpha_r};
+    register __vector double valpha_i = {alpha_i, alpha_i};
+#endif
+
+    register __vector double *vptr_src = (__vector double *) src;
+    if (inc_dest != 2) {
+        register __vector double *vptr_y = (__vector double *) dest;
+        // inc_dest is already doubled (it counts FLOATs, not complex elements),
+        // so it can be added to a double* directly
+        register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
+        register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
+        register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);
+        BLASLONG dest_t = 0;
+        BLASLONG add_dest = inc_dest << 1; // inc_dest is already doubled; shift once more to step four complex elements per iteration
+        for (i = 0; i < n; i += 4) {
+
+            register __vector double vy_0 = vptr_y[dest_t];
+            register __vector double vy_1 = vptr_y1[dest_t];
+            register __vector double vy_2 = vptr_y2[dest_t];
+            register __vector double vy_3 = vptr_y3[dest_t];
+
+            register __vector double vsrc = vptr_src[i];
+            register __vector double vsrc_1 = vptr_src[i + 1];
+            register __vector double vsrc_2 = vptr_src[i + 2];
+            register __vector double vsrc_3 = vptr_src[i + 3];
+
+            vy_0 += vsrc*valpha_r;
+            vy_1 += vsrc_1*valpha_r;
+            vy_2 += vsrc_2*valpha_r;
+            vy_3 += vsrc_3*valpha_r;
+
+            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
+            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
+            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
+            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);
+
+            vy_0 += vsrc*valpha_i;
+            vy_1 += vsrc_1*valpha_i;
+            vy_2 += vsrc_2*valpha_i;
+            vy_3 += vsrc_3*valpha_i;
+
+            vptr_y[dest_t] = vy_0;
+            vptr_y1[dest_t] = vy_1;
+            vptr_y2[dest_t] = vy_2;
+            vptr_y3[dest_t] = vy_3;
+
+            dest_t += add_dest;
+
+        }
+
+        return;
+    } else {
+        register __vector double *vptr_y = (__vector double *) dest;
+        for (i = 0; i < n; i += 4) {
+
+            register __vector double vy_0 = vptr_y[i];
+            register __vector double vy_1 = vptr_y[i + 1];
+            register __vector double vy_2 = vptr_y[i + 2];
+            register __vector double vy_3 = vptr_y[i + 3];
+
+            register __vector double vsrc = vptr_src[i];
+            register __vector double vsrc_1 = vptr_src[i + 1];
+            register __vector double vsrc_2 = vptr_src[i + 2];
+            register __vector double vsrc_3 = vptr_src[i + 3];
+
+            vy_0 += vsrc*valpha_r;
+            vy_1 += vsrc_1*valpha_r;
+            vy_2 += vsrc_2*valpha_r;
+            vy_3 += vsrc_3*valpha_r;
+
+            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
+            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
+            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
+            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);
+
+            vy_0 += vsrc*valpha_i;
+            vy_1 += vsrc_1*valpha_i;
+            vy_2 += vsrc_2*valpha_i;
+            vy_3 += vsrc_3*valpha_i;
+
+            vptr_y[i] = vy_0;
+            vptr_y[i + 1] = vy_1;
+            vptr_y[i + 2] = vy_2;
+            vptr_y[i + 3] = vy_3;
+
+        }
+
+        return;
+    }
+}
+
+#else
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+
+    if (inc_dest != 2) {
+
+        FLOAT temp_r;
+        FLOAT temp_i;
+        for (i = 0; i < n; i++) {
+#if !defined(XCONJ)
+            temp_r = alpha_r * src[0] - alpha_i * src[1];
+            temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+            temp_r = alpha_r * src[0] + alpha_i * src[1];
+            temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+            *dest += temp_r;
+            *(dest + 1) += temp_i;
+
+            src += 2;
+            dest += inc_dest;
+        }
+        return;
+    }
+
+    FLOAT temp_r0;
+    FLOAT temp_i0;
+    FLOAT temp_r1;
+    FLOAT temp_i1;
+    FLOAT temp_r2;
+    FLOAT temp_i2;
+    FLOAT temp_r3;
+    FLOAT temp_i3;
+    for (i = 0; i < n; i += 4) {
+#if !defined(XCONJ)
+        temp_r0 = alpha_r * src[0] - alpha_i * src[1];
+        temp_i0 = alpha_r * src[1] + alpha_i * src[0];
+        temp_r1 = alpha_r * src[2] - alpha_i * src[3];
+        temp_i1 = alpha_r * src[3] + alpha_i * src[2];
+        temp_r2 = alpha_r * src[4] - alpha_i * src[5];
+        temp_i2 = alpha_r * src[5] + alpha_i * src[4];
+        temp_r3 = alpha_r * src[6] - alpha_i * src[7];
+        temp_i3 = alpha_r * src[7] + alpha_i * src[6];
+#else
+        temp_r0 = alpha_r * src[0] + alpha_i * src[1];
+        temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
+        temp_r1 = alpha_r * src[2] + alpha_i * src[3];
+        temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
+        temp_r2 = alpha_r * src[4] + alpha_i * src[5];
+        temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
+        temp_r3 = alpha_r * src[6] + alpha_i * src[7];
+        temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
+#endif
+
+        dest[0] += temp_r0;
+        dest[1] += temp_i0;
+        dest[2] += temp_r1;
+        dest[3] += temp_i1;
+        dest[4] += temp_r2;
+        dest[5] += temp_i2;
+        dest[6] += temp_r3;
+        dest[7] += temp_i3;
+
+        src += 8;
+        dest += 8;
+    }
+    return;
+}
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
+    BLASLONG i;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+    FLOAT xbuffer[16] __attribute__((aligned(16)));
+    FLOAT *ybuffer;
+
+    if (m < 1) return (0);
+    if (n < 1) return (0);
+
+    ybuffer = buffer;
+
+    inc_x *= 2;
+    inc_y *= 2;
+    lda *= 2;
+
+    n1 = n / 8;
+    n2 = n % 8;
+
+    m3 = m % 4;
+    m1 = m - (m % 4);
+    m2 = (m % NBMAX) - (m % 4);
+
+    y_ptr = y;
+
+    BLASLONG NB = NBMAX;
+
+    while (NB == NBMAX) {
+
+        m1 -= NB;
+        if (m1 < 0) {
+            if (m2 == 0) break;
+            NB = m2;
+        }
+
+        a_ptr = a;
+
+        x_ptr = x;
+        //zero_y(NB,ybuffer);
+        memset(ybuffer, 0, NB * 16);
+
+        if (inc_x == 2) {
+
+            for (i = 0; i < n1; i++) {
+                zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer);
+
+                a_ptr += lda << 3;
+                x_ptr += 16;
+            }
+            if (n2 & 4) {
+                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
+
+                a_ptr += lda << 2;
+                x_ptr += 8;
+            }
+
+            if (n2 & 2) {
+                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
+                x_ptr += 4;
+                a_ptr += 2 * lda;
+
+            }
+
+            if (n2 & 1) {
+                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
+                x_ptr += 2;
+                a_ptr += lda;
+
+            }
+        } else {
+
+            for (i = 0; i < n1; i++) {
+
+                xbuffer[0] = x_ptr[0];
+                xbuffer[1] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[2] = x_ptr[0];
+                xbuffer[3] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[4] = x_ptr[0];
+                xbuffer[5] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[6] = x_ptr[0];
+                xbuffer[7] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[8] = x_ptr[0];
+                xbuffer[9] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[10] = x_ptr[0];
+                xbuffer[11] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[12] = x_ptr[0];
+                xbuffer[13] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[14] = x_ptr[0];
+                xbuffer[15] = x_ptr[1];
+                x_ptr += inc_x;
+                zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer);
+
+                a_ptr += lda << 3;
+            }
+            for (i = 0; i < n2; i++) {
+                xbuffer[0] = x_ptr[0];
+                xbuffer[1] = x_ptr[1];
+                x_ptr += inc_x;
+                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
+                a_ptr += lda;
+
+            }
+
+        }
+
+        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
+        a += 2 * NB;
+        y_ptr += NB * inc_y;
+    }
+
+    if (m3 == 0) return (0);
+
+    if (m3 == 1) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r = 0.0;
+        FLOAT temp_i = 0.0;
+
+        if (lda == 2 && inc_x == 2) {
+
+            for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+                a_ptr += 4;
+                x_ptr += 4;
+            }
+
+            for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+                a_ptr += 2;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+        return (0);
+    }
+
+    if (m3 == 2) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r0 = 0.0;
+        FLOAT temp_i0 = 0.0;
+        FLOAT temp_r1 = 0.0;
+        FLOAT temp_i1 = 0.0;
+
+        if (lda == 4 && inc_x == 2) {
+
+            for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+#endif
+
+                a_ptr += 8;
+                x_ptr += 4;
+            }
+
+            for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+                a_ptr += 4;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+        return (0);
+    }
+
+    if (m3 == 3) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r0 = 0.0;
+        FLOAT temp_i0 = 0.0;
+        FLOAT temp_r1 = 0.0;
+        FLOAT temp_i1 = 0.0;
+        FLOAT temp_r2 = 0.0;
+        FLOAT temp_i2 = 0.0;
+
+        if (lda == 6 && inc_x == 2) {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+                a_ptr += 6;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+        return (0);
+    }
+
+    return (0);
+}
+
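[Editorial note: both the zgemv_n kernel above and the zgemv_t kernel in the next file target the POWER10 MMA unit. __builtin_mma_xvf64ger seeds a 4x2 double-precision accumulator with the outer product of a __vector_pair (four doubles from a matrix column) and a vec_t (two doubles from x), and xvf64gerpp keeps stacking rank-1 updates until the accumulator is spilled with SAVE_RESULT or __builtin_mma_disassemble_acc. A minimal standalone sketch of that pattern, assuming a recent GCC built with -mcpu=power10 (illustrative only, not part of the patch):]

    #include <altivec.h>
    typedef __vector unsigned char vec_t;

    /* acc = a(4x1) * x(1x2), a 4x2 double outer product, then read it back.
     * The exact lane layout of the disassembled accumulator follows the
     * POWER10 MMA convention (four 2-double VSX rows). */
    void outer_4x2(const double *a, const double *x, double out[4][2]) {
        __vector_quad acc;
        __vector_pair *va = (__vector_pair *)(void *)a;   /* a[0..3] */
        vec_t *vx = (vec_t *)(void *)x;                   /* x[0..1] */
        __builtin_mma_xxsetaccz(&acc);                    /* zero the accumulator */
        __builtin_mma_xvf64gerpp(&acc, va[0], vx[0]);     /* acc += a * x^T */
        __builtin_mma_disassemble_acc((void *)out, &acc);
    }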
diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c
index 956d75ffc..d3bf60ca7 100644
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #elif HAVE_KERNEL_4x4_VEC
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+typedef __vector unsigned char vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+
+static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector_quad acc0, acc1, acc2, acc3;
+    __vector_quad acc4, acc5, acc6, acc7;
+    v4sf_t result[4];
+    __vector_pair *Va0, *Va1, *Va2, *Va3;
+    i = 0;
+    n = n << 1;
+    __builtin_mma_xxsetaccz (&acc0);
+    __builtin_mma_xxsetaccz (&acc1);
+    __builtin_mma_xxsetaccz (&acc2);
+    __builtin_mma_xxsetaccz (&acc3);
+    __builtin_mma_xxsetaccz (&acc4);
+    __builtin_mma_xxsetaccz (&acc5);
+    __builtin_mma_xxsetaccz (&acc6);
+    __builtin_mma_xxsetaccz (&acc7);
+    while (i < n) {
+
+        vec_t *rx = (vec_t *) & x[i];
+        Va0 = ((__vector_pair*)((void*)&a0[i]));
+        Va1 = ((__vector_pair*)((void*)&a1[i]));
+        Va2 = ((__vector_pair*)((void*)&a2[i]));
+        Va3 = ((__vector_pair*)((void*)&a3[i]));
+
+        __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]);
+        i += 8;
+
+    }
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] - result[1][1];
+    register FLOAT temp_i0 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] - result[3][1];
+    temp_i0 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] - result[1][1];
+    register FLOAT temp_i1 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] - result[3][1];
+    temp_i1 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] - result[1][1];
+    register FLOAT temp_i2 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] - result[3][1];
+    temp_i2 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] - result[1][1];
+    register FLOAT temp_i3 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] - result[3][1];
+    temp_i3 += result[2][1] + result[3][0];
+#else
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] + result[1][1];
+    register FLOAT temp_i0 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] + result[3][1];
+    temp_i0 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] + result[1][1];
+    register FLOAT temp_i1 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] + result[3][1];
+    temp_i1 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] + result[1][1];
+    register FLOAT temp_i2 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] + result[3][1];
+    temp_i2 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] + result[1][1];
+    register FLOAT temp_i3 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] + result[3][1];
+    temp_i3 += result[2][1] - result[3][0];
+#endif
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+#endif
+}
+#else
 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
     BLASLONG i;
     FLOAT *a0, *a1, *a2, *a3;
@@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
 #endif
 }
+#endif
 #else
 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 0068138e8..59ddc149f 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -43,12 +43,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(DOUBLE)
 #include "zscal_microk_power8.c"
 #endif
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #if defined(DOUBLE)
 #include "zscal_microk_power10.c"
 #else
 #include "cscal_microk_power10.c"
 #endif
+#elif defined(POWER10)
+#if defined(DOUBLE)
+#include "zscal_microk_power8.c"
+#endif
 #endif
 #endif
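[Editorial note: the zscal change above and the zswap change below apply the same guard: the POWER10 microkernels assume little-endian lane order, so big-endian POWER10 builds now fall back to the proven POWER8 microkernels instead of selecting code with the wrong element layout. The dispatch shape, schematically (consolidated from the diff, not new code):]

    #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
    #include "zscal_microk_power10.c"   /* little-endian POWER10: use the MMA/VSX path */
    #elif defined(POWER10)
    #include "zscal_microk_power8.c"    /* big-endian POWER10: reuse the POWER8 kernel */
    #endif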
diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c
index 6cd3d9664..908802b71 100644
--- a/kernel/power/zswap.c
+++ b/kernel/power/zswap.c
@@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "zswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "cswap_microk_power10.c"
+#elif defined(POWER10)
+#include "zswap_microk_power8.c"
 #endif
 #endif
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 5da79cc3f..bea7036c2 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -491,4 +491,3 @@ SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c
 
 SOMATCOPY_RT = omatcopy_rt.c
-DOMATCOPY_RT = omatcopy_rt.c
diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c
index 72a87696e..cc5949b1a 100644
--- a/kernel/x86_64/drot_microk_haswell-2.c
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@@ -1,6 +1,4 @@
-/* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
-
+#if defined(HAVE_FMA3) && defined(HAVE_AVX2)
 #define HAVE_DROT_KERNEL 1
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
index cc2ac5553..2588289d1 100644
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,10 +1,10 @@
 
 /* the direct sgemm code written by Arjan van der Ven */
-
+#include "common.h"
 #if defined(SKYLAKEX) || defined (COOPERLAKE)
 #include <immintrin.h>
-#include "common.h"
+
 /*
  * "Direct sgemm" code. This code operates directly on the inputs and outputs
@@ -472,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s
 }
 }
 #else
-#include "common.h"
+
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) {}
 #endif
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
index f3d614242..2db8b2fea 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
@@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};
     BLASLONG n_count = n;
     float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+#if defined(__clang__)
+    for(;n_count>23;n_count-=24) COMPUTE(24)
+#else
     for(;n_count>23;n_count-=24) COMPUTE_n24
+#endif
     for(;n_count>19;n_count-=20) COMPUTE(20)
     for(;n_count>15;n_count-=16) COMPUTE(16)
     for(;n_count>11;n_count-=12) COMPUTE(12)
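[Editorial note: the drot change above replaces compiler-version sniffing with the build system's own feature macros: whether the Haswell kernel is usable depends on the target's FMA3/AVX2 support, not on which compiler built it (the old test even keyed on __AVX512CD__, which the changelog calls out as the wrong requirement for a Haswell kernel). When a runtime rather than compile-time decision is wanted, GCC and clang expose the same information through __builtin_cpu_supports; a hypothetical sketch, not part of the patch:]

    /* runtime feature check via the GCC/clang builtin; illustrative only */
    static int can_use_haswell_drot(void) {
        return __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
    }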
diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index aedaa308d..ada1944b2 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -566,8 +566,8 @@ void LAPACK_cgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     lapack_complex_float const* AB, lapack_int const* ldab,
     lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -585,8 +585,8 @@ void LAPACK_dgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     double const* AB, lapack_int const* ldab,
     double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -604,8 +604,8 @@ void LAPACK_sgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     float const* AB, lapack_int const* ldab,
     float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -623,8 +623,8 @@ void LAPACK_zgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     lapack_complex_double const* AB, lapack_int const* ldab,
     lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -2941,6 +2941,42 @@ void LAPACK_zgetsls(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT)
+void LAPACK_cgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT)
+void LAPACK_dgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT)
+void LAPACK_sgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT)
+void LAPACK_zgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
 void LAPACK_cggbak(
     char const* job, char const* side,
@@ -4768,7 +4804,7 @@ void LAPACK_chegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_float* A, lapack_int const* lda,
-    lapack_complex_float* B, lapack_int const* ldb,
+    const lapack_complex_float* B, lapack_int const* ldb,
     lapack_int* info );
 
 #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
@@ -4776,7 +4812,7 @@ void LAPACK_zhegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_double* A, lapack_int const* lda,
-    lapack_complex_double* B, lapack_int const* ldb,
+    const lapack_complex_double* B, lapack_int const* ldb,
     lapack_int* info );
 
 #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
@@ -4913,7 +4949,7 @@ void LAPACK_cherfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -4931,7 +4967,7 @@ void LAPACK_zherfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -7251,6 +7287,24 @@ void LAPACK_sorgtr(
     float* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW)
+void LAPACK_dorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double const* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW)
+void LAPACK_sorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float const* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr(
     char const* vect, char const* side, char const* trans,
@@ -8005,7 +8059,7 @@ void LAPACK_cporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -8023,7 +8077,7 @@ void LAPACK_dporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     double const* A, lapack_int const* lda,
     double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -8041,7 +8095,7 @@ void LAPACK_sporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     float const* A, lapack_int const* lda,
     float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -8059,7 +8113,7 @@ void LAPACK_zporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -10756,7 +10810,7 @@ void LAPACK_csyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -10774,7 +10828,7 @@ void LAPACK_dsyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     double const* A, lapack_int const* lda,
     double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -10792,7 +10846,7 @@ void LAPACK_ssyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     float const* A, lapack_int const* lda,
     float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -10810,7 +10864,7 @@ void LAPACK_zsyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -11556,7 +11610,7 @@ void LAPACK_zsytrs(
 void LAPACK_csytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
     lapack_complex_float* B, lapack_int const* ldb,
     lapack_complex_float* work,
     lapack_int* info );
@@ -11565,7 +11619,7 @@ void LAPACK_csytrs2(
 void LAPACK_dsytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const double* A, lapack_int const* lda, lapack_int const* ipiv,
     double* B, lapack_int const* ldb,
     double* work,
     lapack_int* info );
@@ -11574,7 +11628,7 @@ void LAPACK_ssytrs2(
 void LAPACK_ssytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const float* A, lapack_int const* lda, lapack_int const* ipiv,
     float* B, lapack_int const* ldb,
     float* work,
     lapack_int* info );
@@ -11583,7 +11637,7 @@ void LAPACK_zsytrs2(
 void LAPACK_zsytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
     lapack_complex_double* B, lapack_int const* ldb,
     lapack_complex_double* work,
     lapack_int* info );
@@ -13540,6 +13594,24 @@ void LAPACK_zungtr(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW)
+void LAPACK_cungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW)
+void LAPACK_zungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr(
     char const* vect, char const* side, char const* trans,
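[Editorial note: all of the prototype edits in lapack.h above are the interface fixes imported from Reference-LAPACK PRs 534 and 537: arrays the Fortran routines only read (the R/C row and column scalings, the S scaling, and the factored matrices in ?sytrs2/?hegst) gain a const qualifier. The change is source-compatible for existing callers, since C implicitly converts T* to const T*:]

    /* existing callers keep compiling: double* converts to const double* */
    extern void takes_const(const double *S);
    void caller(double *S) { takes_const(S); }  /* fine, no cast needed */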
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 012c104bb..5c129db91 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo,
 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int lda, const lapack_complex_float* b,
                            lapack_int ldb );
 lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int lda, const lapack_complex_double* b,
                            lapack_int ldb );
 lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz,
@@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a,
 lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a,
                            lapack_int lda, const double* tau );
+lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 float* a, lapack_int lda,
+                                 const float* t, lapack_int ldt );
+lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 double* a, lapack_int lda,
+                                 const double* t, lapack_int ldt );
+
 lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans,
                            lapack_int m, lapack_int n, lapack_int k,
                            const float* a, lapack_int lda, const float* tau,
@@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n,
                            lapack_complex_double* a, lapack_int lda,
                            const lapack_complex_double* tau );
+lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* t, lapack_int ldt );
+
 lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans,
                            lapack_int m, lapack_int n, lapack_int k,
                            const lapack_complex_float* a, lapack_int lda,
@@ -6932,11 +6950,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range,
 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int lda, const lapack_complex_float* b,
                                 lapack_int ldb );
 lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int lda, const lapack_complex_double* b,
                                 lapack_int ldb );
 lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz,
@@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n,
                                 double* a, lapack_int lda, const double* tau,
                                 double* work, lapack_int lwork );
+lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      float* a, lapack_int lda,
+                                      const float* t, lapack_int ldt,
+                                      float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      double* a, lapack_int lda,
+                                      const double* t, lapack_int ldt,
+                                      double* work, lapack_int lwork );
+
 lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side,
                                 char trans, lapack_int m, lapack_int n,
                                 lapack_int k, const float* a, lapack_int lda,
@@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n,
                                 const lapack_complex_double* tau,
                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_float* a, lapack_int lda,
+                                      const lapack_complex_float* t, lapack_int ldt,
+                                      lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_double* a, lapack_int lda,
+                                      const lapack_complex_double* t, lapack_int ldt,
+                                      lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side,
                                 char trans, lapack_int m, lapack_int n,
                                 lapack_int k, const lapack_complex_float* a,
@@ -10553,11 +10597,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_float* work, lapack_int nb );
 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_float* a,
+                            lapack_int nrhs, const lapack_complex_float* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_float* b, lapack_int ldb );
 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_float* a,
+                                 lapack_int nrhs, const lapack_complex_float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_float* b, lapack_int ldb,
                                  lapack_complex_float* work );
@@ -10718,10 +10762,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, double* work,
                                   lapack_int nb );
 lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, double* a, lapack_int lda,
+                            lapack_int nrhs, const double* a, lapack_int lda,
                             const lapack_int* ipiv,
                             double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, double* a,
+                                 lapack_int nrhs, const double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  double* b, lapack_int ldb, double* work );
 lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10813,10 +10857,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, float* work,
                                   lapack_int nb );
 lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, float* a, lapack_int lda,
+                            lapack_int nrhs, const float* a, lapack_int lda,
                             const lapack_int* ipiv,
                             float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, float* a,
+                                 lapack_int nrhs, const float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  float* b, lapack_int ldb, float* work );
 lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10898,11 +10942,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_double* work, lapack_int nb );
 lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_double* a,
+                            lapack_int nrhs, const lapack_complex_double* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_double* b, lapack_int ldb );
 lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_double* a,
+                                 lapack_int nrhs, const lapack_complex_double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work );
@@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work, lapack_int lwork );
 
+lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               float* a, lapack_int lda,
+                               float* t, lapack_int ldt );
+lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               double* a, lapack_int lda,
+                               double* t, lapack_int ldt );
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    float* a, lapack_int lda,
+                                    float* t, lapack_int ldt,
+                                    float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    double* a, lapack_int lda,
+                                    double* t, lapack_int ldt,
+                                    double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_float* a, lapack_int lda,
+                                    lapack_complex_float* t, lapack_int ldt,
+                                    lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_double* a, lapack_int lda,
+                                    lapack_complex_double* t, lapack_int ldt,
+                                    lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_ssyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
                                  float* a, lapack_int lda, float* w );
 lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
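[Editorial note: the new LAPACKE_?getsqrhrt wrappers follow the standard LAPACKE two-phase pattern, visible in the new source files further below: call the _work variant with lwork = -1 to query the optimal workspace size, allocate, then call again. The same pattern from the caller's side, sketched for the double variant; this is essentially what the high-level LAPACKE_dgetsqrhrt wrapper does internally, and it assumes a build that ships these new symbols:]

    #include <stdlib.h>
    #include "lapacke.h"

    lapack_int tsqr_with_hr(lapack_int m, lapack_int n, lapack_int mb1,
                            lapack_int nb1, lapack_int nb2,
                            double *a, lapack_int lda, double *t, lapack_int ldt) {
        double query;
        /* phase 1: workspace query (lwork = -1) */
        lapack_int info = LAPACKE_dgetsqrhrt_work(LAPACK_COL_MAJOR, m, n, mb1, nb1,
                                                  nb2, a, lda, t, ldt, &query, -1);
        if (info != 0) return info;
        lapack_int lwork = (lapack_int)query;
        double *work = malloc(sizeof(double) * lwork);
        if (work == NULL) return LAPACK_WORK_MEMORY_ERROR;
        /* phase 2: the actual factorization */
        info = LAPACKE_dgetsqrhrt_work(LAPACK_COL_MAJOR, m, n, mb1, nb1, nb2,
                                       a, lda, t, ldt, work, lwork);
        free(work);
        return info;
    }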
diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index a602dd7a0..7f827e1c9 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -162,6 +162,8 @@ lapacke_cgetrs.o \
 lapacke_cgetrs_work.o \
 lapacke_cgetsls.o \
 lapacke_cgetsls_work.o \
+lapacke_cgetsqrhrt.o \
+lapacke_cgetsqrhrt_work.o \
 lapacke_cggbak.o \
 lapacke_cggbak_work.o \
 lapacke_cggbal.o \
@@ -634,6 +636,8 @@ lapacke_cungrq.o \
 lapacke_cungrq_work.o \
 lapacke_cungtr.o \
 lapacke_cungtr_work.o \
+lapacke_cungtsqr_row.o \
+lapacke_cungtsqr_row_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@@ -778,6 +782,8 @@ lapacke_dgetrs.o \
 lapacke_dgetrs_work.o \
 lapacke_dgetsls.o \
 lapacke_dgetsls_work.o \
+lapacke_dgetsqrhrt.o \
+lapacke_dgetsqrhrt_work.o \
 lapacke_dggbak.o \
 lapacke_dggbak_work.o \
 lapacke_dggbal.o \
@@ -900,6 +906,8 @@ lapacke_dorgrq.o \
 lapacke_dorgrq_work.o \
 lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
+lapacke_dorgtsqr_row.o \
+lapacke_dorgtsqr_row_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \
 lapacke_sgetrs_work.o \
 lapacke_sgetsls.o \
 lapacke_sgetsls_work.o \
+lapacke_sgetsqrhrt.o \
+lapacke_sgetsqrhrt_work.o \
 lapacke_sggbak.o \
 lapacke_sggbak_work.o \
 lapacke_sggbal.o \
@@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \
 lapacke_sorgrq_work.o \
 lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
+lapacke_sorgtsqr_row.o \
+lapacke_sorgtsqr_row_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \
 lapacke_zgetrs_work.o \
 lapacke_zgetsls.o \
 lapacke_zgetsls_work.o \
+lapacke_zgetsqrhrt.o \
+lapacke_zgetsqrhrt_work.o \
 lapacke_zggbak.o \
 lapacke_zggbak_work.o \
 lapacke_zggbal.o \
@@ -2380,6 +2394,8 @@ lapacke_zungrq.o \
 lapacke_zungrq_work.o \
 lapacke_zungtr.o \
 lapacke_zungtr_work.o \
+lapacke_zungtsqr_row.o \
+lapacke_zungtsqr_row_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
index 558a7f308..4256c0f04 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
@@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
                                 ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
         lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                               ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+        lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                                LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
         lapack_int lda_t = MAX(1,m);
         lapack_int ldu_t = MAX(1,nrows_u);
         lapack_int ldvt_t = MAX(1,nrows_vt);
@@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
             LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
             return info;
         }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
             info = -12;
             LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
             return info;
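[Editorial note: the lapacke_cgesvd_work change relaxes an overly strict parameter check. VT has n columns only when jobvt is 'A' or 'S'; for 'N' or 'O' it is not referenced, so demanding ldvt >= n rejected valid calls. With the fix, a row-major call that computes singular values only is accepted (hedged example using the standard LAPACKE_cgesvd signature; previously this would fail with info = -12):]

    /* 3x2 row-major A, singular values only: U and VT are not referenced,
     * so ldu = ldvt = 1 is now considered valid */
    lapack_complex_float a[6] = {0};
    float s[2], superb[1];
    lapack_int info = LAPACKE_cgesvd(LAPACK_ROW_MAJOR, 'N', 'N', 3, 2,
                                     a, 2, s, NULL, 1, NULL, 1, superb);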
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
new file mode 100644
index 000000000..0e67e0b83
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
@@ -0,0 +1,80 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_complex_float work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -7;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = LAPACK_C2INT( work_query );
+    /* Allocate memory for work arrays */
+    work = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
new file mode 100644
index 000000000..598f193e6
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
@@ -0,0 +1,108 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout 
!= LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_clascl.c b/lapack-netlib/LAPACKE/src/lapacke_clascl.c index fdcb02947..4f4e0bf35 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl, LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c new file mode 100644 index 000000000..bb551fcbc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c new file mode 100644 index 000000000..96b18ab13 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_cge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
index d9fb2dca0..71ad23f2f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
@@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
         r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) {
             return -7;
         }
         if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
index 7dbc9bb88..671def1df 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
@@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
                              ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
     lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                          ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+    lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                            LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
     lapack_int lda_t = MAX(1,m);
     lapack_int ldu_t = MAX(1,nrows_u);
     lapack_int ldvt_t = MAX(1,nrows_vt);
@@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
             LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
             return info;
         }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
             info = -12;
             LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
             return info;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
new file mode 100644
index 000000000..cf0e3200c
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c new file mode 100644 index 000000000..f91887ffe --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - 
LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c index 5b579a5d1..058105127 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c new file mode 100644 index 000000000..1da3405a8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c new file mode 100644 index 000000000..e16467f3a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_dge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
index db75a6609..0b1c54b9b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
@@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
         r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) {
             return -7;
         }
         if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
index cca9be489..36ff7c40c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
@@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
index f696c608f..78f9e80ed 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
@@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
             LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
         } else {
             LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
index 6f9c02f6a..d68989aa6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
@@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
             LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda
); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c new file mode 100644 index 000000000..759afce48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
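+
+  NOTE: the wrapper below follows the standard LAPACKE two-phase convention:
+  a first call with lwork = -1 queries the Fortran routine for its optimal
+  workspace size, then the wrapper allocates that workspace and makes the
+  real call. A minimal caller sketch (not part of the Reference-LAPACK
+  sources), assuming a column-major m-by-n array a and an nb2-by-n output
+  array t already allocated by the user:
+
+      #include "lapacke.h"
+      lapack_int info = LAPACKE_sgetsqrhrt( LAPACK_COL_MAJOR, m, n,
+                                            mb1, nb1, nb2, a, m, t, nb2 );
+
+  Workspace allocation is handled entirely inside the wrapper.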
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c new file mode 100644 index 000000000..40193008d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
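+
+  NOTE: in the row-major branch, the wrapper below validates lda and ldt,
+  copies a into a transposed scratch buffer, calls the column-major Fortran
+  kernel, and transposes the results back; t needs no input transpose since
+  SGETSQRHRT writes it as pure output. A simplified sketch of that flow
+  (not part of the Reference-LAPACK sources):
+
+      LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, n, a, lda, a_t, lda_t );
+      LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t,
+                         t_t, &ldt_t, work, &lwork, &info );
+      LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+      LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
+
+  Negative info values from the Fortran side are decremented by one so that
+  they keep pointing at the right argument once matrix_layout counts as
+  argument 1 of the C interface.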
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( 
matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slascl.c b/lapack-netlib/LAPACKE/src/lapacke_slascl.c index 25bd9624e..62f7390ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_slascl( int matrix_layout, char type, lapack_int kl, LAPACKE_sgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_ssb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally 
check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c new file mode 100644 index 000000000..350783a78 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
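+
+  NOTE: SORGTSQR_ROW generates the explicit m-by-n orthonormal factor Q from
+  the blocked Householder representation produced by SLATSQR: the reflectors
+  stay in a and the nb-by-n array t holds the triangular T blocks. A hedged
+  usage sketch (not part of the Reference-LAPACK sources), with column-major
+  data and lda = m, ldt = nb:
+
+      #include "lapacke.h"
+      /* after a factorization along the lines of
+         LAPACKE_slatsqr( LAPACK_COL_MAJOR, m, n, mb, nb, a, m, t, nb ); */
+      lapack_int info = LAPACKE_sorgtsqr_row( LAPACK_COL_MAJOR, m, n,
+                                              mb, nb, a, m, t, nb );
+
+  The NaN checks below assume the same shapes: a is m-by-n, t is nb-by-n.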
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c new file mode 100644 index 000000000..a66f70b52 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices; note t (not a) is copied into t_t */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ?
m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c new file mode 100644 index 000000000..53557c92d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c new file mode 100644 index 000000000..a6825df56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( 
matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c index 7e37d559c..8bf1ee767 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_zlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_zgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_zhb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c new file mode 100644 index 000000000..71418fb84 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
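+
+  NOTE: this is the complex double-precision counterpart of
+  LAPACKE_sorgtsqr_row and shares its structure; the one twist is that the
+  optimal workspace size comes back in a complex scalar and is converted
+  with LAPACK_Z2INT. A sketch of that query idiom (mirroring the code
+  below, not part of the Reference-LAPACK sources):
+
+      lapack_complex_double work_query;
+      lapack_int lwork = -1;
+      info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                        a, lda, t, ldt, &work_query, lwork );
+      lwork = LAPACK_Z2INT( work_query );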
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c new file mode 100644 index 000000000..909855864 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_zge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 83baac875..d1ee96667 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -135,14 +135,14 @@ SLASRC_O = \
    slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
+   slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
    slarrv.o slartv.o \
    slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
    slasyf_rk.o \
    slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \
    slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
    sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
-   sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \
+   sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \
    sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
    sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
    spbstf.o spbsv.o spbsvx.o \
@@ -181,7 +181,7 @@ SLASRC_O = \
    sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \
    stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \
    sgelqt.o sgelqt3.o sgemlqt.o \
-   sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
+   sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
    sgelq.o slaswlq.o slamswlq.o sgemlq.o \
    stplqt.o stplqt2.o stpmlqt.o \
    sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \
@@ -250,7 +250,7 @@ CLASRC_O = \
    claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
-   clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \
+   clarf.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
    clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
    clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \
    claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -278,7 +278,7 @@ CLASRC_O = \
    ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
    ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
    cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
-   cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o
cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -289,7 +289,7 @@ CLASRC_O = \ cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \ cgelqt.o cgelqt3.o cgemlqt.o \ - cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ + cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ ctplqt.o ctplqt2.o ctpmlqt.o \ cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \ @@ -342,14 +342,14 @@ DLASRC_O = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -389,7 +389,7 @@ DLASRC_O = \ dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \ dgelqt.o dgelqt3.o dgemlqt.o \ - dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ + dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ @@ -455,7 +455,7 @@ ZLASRC_O = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -484,7 +484,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -498,7 +498,7 @@ ZLASRC_O = \ ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \ ztplqt.o ztplqt2.o ztpmlqt.o \ zgelqt.o zgelqt3.o zgemlqt.o \ - zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ + zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ 
b/lapack-netlib/SRC/cgesdd.f
@@ -281,9 +281,9 @@
      $                   CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, SISNAN
       REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, SLAMCH, CLANGE
+      EXTERNAL           LSAME, SLAMCH, CLANGE, SISNAN
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN, SQRT
@@ -647,6 +647,10 @@
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
       ANRM = CLANGE( 'M', M, N, A, LDA, DUM )
+      IF( SISNAN ( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
       ISCL = 0
       IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
          ISCL = 1
diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f
new file mode 100644
index 000000000..4e4dc1d4a
--- /dev/null
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b CGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGETSQRHRT + dependencies
+*> [TGZ]
+*> [ZIP]
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*      $                       LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGETSQRHRT computes a NB2-sized column blocked QR-factorization
+*> of a complex M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally a NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in CGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of CGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
+     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, REAL, CMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      INFO = 0
+      LQUERY = LWORK.EQ.-1
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB1.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB1.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( NB2.LT.1 ) THEN
+         INFO = -5
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -7
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
+         INFO = -9
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array:
+*        a) Matrix T and WORK for CLATSQR;
+*        b) N-by-N upper-triangular factor R_tsqr;
+*        c) Matrix T and array WORK for CUNGTSQR_ROW;
+*        d) Diagonal D for CUNHR_COL.
+*
+         IF( LWORK.LT.N*N+1 .AND.
.NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of CUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
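Since CGETSQRHRT is new in this release, a hedged usage sketch may help; the argument list follows the SUBROUTINE statement documented above, called here directly through the Fortran symbol with the standard two-phase workspace query. The cgetsqrhrt_ trailing-underscore mangling, the example sizes, and the test data are assumptions for illustration, not part of this patch (error handling elided):

```c
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>

typedef float _Complex cfloat;

/* Fortran calling convention: everything by reference. */
extern void cgetsqrhrt_(const int *m, const int *n, const int *mb1,
                        const int *nb1, const int *nb2, cfloat *a,
                        const int *lda, cfloat *t, const int *ldt,
                        cfloat *work, const int *lwork, int *info);

int main(void)
{
    /* Constraints from the docs: M >= N, MB1 > N, N >= NB1 >= 1, NB2 >= 1. */
    int m = 1000, n = 32, mb1 = 128, nb1 = 32, nb2 = 32;
    int lda = m, ldt = nb2, info = 0, lwork = -1;
    cfloat *a = malloc((size_t)lda * n * sizeof *a);
    cfloat *t = malloc((size_t)ldt * n * sizeof *t);
    cfloat wq;

    for (int j = 0; j < n; ++j)               /* arbitrary test matrix */
        for (int i = 0; i < m; ++i)
            a[i + (size_t)j * lda] = (i == j) + 0.001f * (i % 7);

    /* Phase 1: LWORK = -1 returns the optimal size in WORK(1). */
    cgetsqrhrt_(&m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, &wq, &lwork,
                &info);
    if (info == 0) {
        lwork = (int)crealf(wq);
        cfloat *work = malloc((size_t)lwork * sizeof *work);
        /* Phase 2: blocked TSQR + Householder reconstruction. */
        cgetsqrhrt_(&m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, work,
                    &lwork, &info);
        free(work);
    }
    printf("info = %d\n", info);
    free(a); free(t);
    return 0;
}
```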
+*
+      DO I = 1, N
+         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
+            DO J = I, N
+               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
+            END DO
+         ELSE
+            CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
+         END IF
+      END DO
+*
+      WORK( 1 ) = CMPLX( LWORKOPT )
+      RETURN
+*
+*     End of CGETSQRHRT
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f
index 336f41909..9c8e0eec3 100644
--- a/lapack-netlib/SRC/cggglm.f
+++ b/lapack-netlib/SRC/cggglm.f
@@ -271,8 +271,15 @@
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 )
-     $   RETURN
+      IF( N.EQ.0 ) THEN
+         DO I = 1, M
+            X(I) = CZERO
+         END DO
+         DO I = 1, P
+            Y(I) = CZERO
+         END DO
+         RETURN
+      END IF
 *
 *     Compute the GQR factorization of matrices A and B:
 *
diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f
index 4725e7169..bcf5acd0b 100644
--- a/lapack-netlib/SRC/chgeqz.f
+++ b/lapack-netlib/SRC/chgeqz.f
@@ -319,14 +319,14 @@
       REAL               ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
      $                   C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
       COMPLEX            ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
-     $                   CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
+     $                   CTEMP3, ESHIFT, S, SHIFT, SIGNBC,
      $                   U12, X, ABI12, Y
 *     ..
 *     .. External Functions ..
       COMPLEX            CLADIV
       LOGICAL            LSAME
       REAL               CLANHS, SLAMCH
-      EXTERNAL           CLADIV, LLSAME, CLANHS, SLAMCH
+      EXTERNAL           CLADIV, LSAME, CLANHS, SLAMCH
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARTG, CLASET, CROT, CSCAL, XERBLA
@@ -351,6 +351,7 @@
          ILSCHR = .TRUE.
          ISCHUR = 2
       ELSE
+         ILSCHR = .TRUE.
          ISCHUR = 0
       END IF
 *
@@ -364,6 +365,7 @@
          ILQ = .TRUE.
          ICOMPQ = 3
       ELSE
+         ILQ = .TRUE.
         ICOMPQ = 0
       END IF
 *
@@ -377,6 +379,7 @@
          ILZ = .TRUE.
          ICOMPZ = 3
       ELSE
+         ILZ = .TRUE.
         ICOMPZ = 0
       END IF
 *
diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f
index cfcf725b2..32b6fa87b 100644
--- a/lapack-netlib/SRC/chseqr.f
+++ b/lapack-netlib/SRC/chseqr.f
@@ -320,10 +320,10 @@
 *     .    CLAHQR because of insufficient subdiagonal scratch space.
 *     .    (This is a hard limit.) ====
       INTEGER            NTINY
-      PARAMETER          ( NTINY = 11 )
+      PARAMETER          ( NTINY = 15 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare CLAHQR failure.  NL > NTINY = 11 is
+*     .    through a rare CLAHQR failure.  NL > NTINY = 15 is
 *     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f
index 2f0ea20db..233721352 100644
--- a/lapack-netlib/SRC/claqr0.f
+++ b/lapack-netlib/SRC/claqr0.f
@@ -260,7 +260,7 @@
 *     .    CLAHQR because of insufficient subdiagonal scratch space.
 *     .    (This is a hard limit.) ====
       INTEGER            NTINY
-      PARAMETER          ( NTINY = 11 )
+      PARAMETER          ( NTINY = 15 )
 *
 *     ==== Exceptional deflation windows:  try to cure rare
 *     .    slow convergence by varying the size of the
@@ -355,22 +355,22 @@
       END IF
 *
 *     ==== NWR = recommended deflation window size.  At this
-*     .    point,  N .GT. NTINY = 11, so there is enough
+*     .    point,  N .GT. NTINY = 15, so there is enough
 *     .    subdiagonal workspace for NWR.GE.2 as required.
 *     .    (In fact, there is enough subdiagonal space for
-*     .    NWR.GE.3.) ====
+*     .    NWR.GE.4.) ====
 *
       NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK )
       NWR = MAX( 2, NWR )
       NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR )
 *
 *     ==== NSR = recommended number of simultaneous shifts.
-*     .    At this point N .GT. NTINY = 11, so there is at
+*     .    At this point N .GT. NTINY = 15, so there is at
 *     .    enough subdiagonal workspace for NSR to be even
 *     .
and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. 
*> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. 
+* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, 
NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f new file mode 100644 index 000000000..ee6959ed8 --- /dev/null +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b CLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
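Concretely, the extra work in Case (A) is four triangular multiplies: when V1 is not the identity, the column-block-2 path performs col2_(2) and col2_(6) (the CTRMM applications of unit_lower_tr_of_(A1)**H and unit_lower_tr_of_(A1)), and the column-block-1 path performs col1_(2) and col1_(5); all four drop out in Case (B), as the combined algorithm below makes explicit.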
+*>
+*>    if( IDENT != 'I' ) then
+*>     if ( N > K ) then
+*>       (First Step - column block 2)
+*>       col2_(1) W2: = A2
+*>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
+*>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>       col2_(4) W2: = T * W2
+*>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>       col2_(7) A2: = A2 - W2
+*>     end if
+*>     (Second Step - column block 1)
+*>     col1_(1) W1: = A1
+*>     col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
+*>     col1_(3) W1: = T * W1
+*>     col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>     col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>     col1_(6) square A1: = A1 - W1
+*>    end if
+*>
+*>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
+*>    requires fewer computations than Case (A):
+*>
+*>    if( IDENT == 'I' ) then
+*>     if ( N > K ) then
+*>       (First Step - column block 2)
+*>       col2_(1) W2: = A2
+*>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>       col2_(4) W2: = T * W2
+*>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>       col2_(7) A2: = A2 - W2
+*>     end if
+*>     (Second Step - column block 1)
+*>     col1_(1) W1: = A1
+*>     col1_(3) W1: = T * W1
+*>     col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>     col1_(6) upper-triangular_of_(A1): = A1 - W1
+*>    end if
+*>
+*>    Combining cases (A) and (B), the resulting algorithm is
+*>    (note that the Second Step is always performed, matching the
+*>    executable code below, while the First Step exists only
+*>    if N > K):
+*>
+*>    if ( N > K ) then
+*>
+*>     (First Step - column block 2)
+*>
+*>     col2_(1) W2: = A2
+*>     if( IDENT != 'I' ) then
+*>       col2_(2) W2: = (V1**H) * W2
+*>                    = (unit_lower_tr_of_(A1)**H) * W2
+*>     end if
+*>     col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>     col2_(4) W2: = T * W2
+*>     col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>     if( IDENT != 'I' ) then
+*>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>     end if
+*>     col2_(7) A2: = A2 - W2
+*>
+*>    end if
+*>
+*>    (Second Step - column block 1)
+*>
+*>    col1_(1) W1: = A1
+*>    if( IDENT != 'I' ) then
+*>      col1_(2) W1: = (V1**H) * W1
+*>                   = (unit_lower_tr_of_(A1)**H) * W1
+*>    end if
+*>    col1_(3) W1: = T * W1
+*>    col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>    if( IDENT != 'I' ) then
+*>      col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>      col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*>    end if
+*>    col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+     $                        WORK, LDWORK )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          IDENT
+      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE  = ( 1.0E+0, 0.0E+0 ),
+     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LNOTIDENT
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CGEMM, CTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N )
+     $   RETURN
+*
+      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+*     ------------------------------------------------------------------
+*
+*     First Step. Computation of the Column Block 2:
+*
+*        ( A2 ) := H * ( A2 )
+*        ( B2 )        ( B2 )
+*
+*     ------------------------------------------------------------------
+*
+      IF( N.GT.K ) THEN
+*
+*        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+*        into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+         DO J = 1, N-K
+            CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+         END DO
+
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
+     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
+         END IF
+*
+*        col2_(4) Compute W2: = T * W2,
+*        T is upper-triangular.
+*
+         CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
+     $               WORK, LDWORK )
+*
+*        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
+     $                  WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
+         END IF
+*
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(7) Compute A2: = A2 - W2 =
+*        = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+*        column-by-column.
+*
+         DO J = 1, N-K
+            DO I = 1, K
+               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     ------------------------------------------------------------------
+*
+*     Second Step. Computation of the Column Block 1:
+*
+*        ( A1 ) := H * ( A1 )
+*        ( B1 )        (  0 )
+*
+*     ------------------------------------------------------------------
+*
+*     col1_(1) Compute W1: = A1. Copy the upper-triangular
+*     A1 = A(1:K, 1:K) into the upper-triangular
+*     W1 = WORK(1:K, 1:K) column-by-column.
+*
+      DO J = 1, K
+         CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+      END DO
+*
+*     Set the subdiagonal elements of W1 to zero column-by-column.
+*
+      DO J = 1, K - 1
+         DO I = J + 1, K
+            WORK( I, J ) = CZERO
+         END DO
+      END DO
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular with zeroes below the diagonal.
+*
+         CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+      END IF
+*
+*     col1_(3) Compute W1: = T * W1,
+*     T is upper-triangular,
+*     W1 is upper-triangular with zeroes below the diagonal.
+*
+      CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
+     $            WORK, LDWORK )
+*
+*     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+*     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+      IF( M.GT.0 ) THEN
+         CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
+     $               B, LDB )
+      END IF
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular on input with zeroes below the diagonal,
+*        and square on output.
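+*        (In the CTRMM call below: SIDE='L' applies V1 from the left,
+*        UPLO='L' with DIAG='U' selects the unit lower-triangular
+*        part of the array A as V1, and TRANSA='N' applies V1 itself
+*        rather than its conjugate transpose.)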
+*
+         CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+*
+*        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+*        column-by-column. A1 is upper-triangular on input.
+*        If NOT IDENT, A1 is square on output, and W1 is square,
+*        if IDENT, A1 is upper-triangular on output,
+*        W1 is upper-triangular.
+*
+*        col1_(6)_a Compute elements of A1 below the diagonal.
+*
+         DO J = 1, K - 1
+            DO I = J + 1, K
+               A( I, J ) = - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+      DO J = 1, K
+         DO I = 1, J
+            A( I, J ) = A( I, J ) - WORK( I, J )
+         END DO
+      END DO
+*
+      RETURN
+*
+*     End of CLARFB_GETT
+*
+      END
diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f
index 38a61068e..c96cbe022 100644
--- a/lapack-netlib/SRC/ctgsja.f
+++ b/lapack-netlib/SRC/ctgsja.f
@@ -401,7 +401,7 @@
 *     .. Parameters ..
       INTEGER            MAXIT
       PARAMETER          ( MAXIT = 40 )
-      REAL               ZERO, ONE
+      REAL               ZERO, ONE, HUGENUM
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
       COMPLEX            CZERO, CONE
       PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
@@ -424,7 +424,8 @@
      $                   SLARTG, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, CONJG, MAX, MIN, REAL
+      INTRINSIC          ABS, CONJG, MAX, MIN, REAL, HUGE
+      PARAMETER          ( HUGENUM = HUGE(ZERO) )
 *     ..
 *     .. Executable Statements ..
 *
@@ -610,9 +611,9 @@
 *
             A1 = REAL( A( K+I, N-L+I ) )
             B1 = REAL( B( I, N-L+I ) )
+            GAMMA = B1 / A1
 *
-            IF( A1.NE.ZERO ) THEN
-               GAMMA = B1 / A1
+            IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN
 *
                IF( GAMMA.LT.ZERO ) THEN
                   CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB )
diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f
index df25799ca..0dddd42a6 100644
--- a/lapack-netlib/SRC/cungbr.f
+++ b/lapack-netlib/SRC/cungbr.f
@@ -222,8 +222,8 @@
             CALL CUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
          ELSE
             IF( M.GT.1 ) THEN
-               CALL CUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL CUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
             END IF
          END IF
       ELSE
@@ -231,8 +231,8 @@
             CALL CUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
          ELSE
             IF( N.GT.1 ) THEN
-               CALL CUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL CUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
             END IF
          END IF
       END IF
diff --git a/lapack-netlib/SRC/cungtsqr_row.f b/lapack-netlib/SRC/cungtsqr_row.f
new file mode 100644
index 000000000..e1597c58b
--- /dev/null
+++ b/lapack-netlib/SRC/cungtsqr_row.f
@@ -0,0 +1,380 @@
+*> \brief \b CUNGTSQR_ROW
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CUNGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+*      $                         LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
+*> orthonormal columns from the output of CLATSQR. These N orthonormal
+*> columns are the first N columns of a product of complex unitary
+*> matrices Q(k)_in of order M, which are returned by CLATSQR in
+*> a special format.
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of CLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine CLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in reverse order of
+*> the order in which CLATSQR generates the output blocks.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by CLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by CLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not used as
+*>             input. The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by CLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored). See CLATSQR for more details.
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e. the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of the NIRB block reflector
+*>          sequences is stored in a larger NB-by-N column block of T
+*>          and consists of NICB smaller NB-by-NB upper-triangular
+*>          column blocks. See CLATSQR for more details on the format
+*>          of T.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
+*>          where NBLOCAL=MIN(NB,N).
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
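+*>
+*>          For example (an illustrative sketch; WQUERY here names a
+*>          caller-chosen COMPLEX scratch array of length 1), the
+*>          usual two-call workspace-query pattern is:
+*>
+*>             CALL CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT,
+*>                                WQUERY, -1, INFO )
+*>             LWORK = INT( WQUERY( 1 ) )
+*>             (allocate WORK(1:LWORK), then)
+*>             CALL CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT,
+*>                                WORK, LWORK, INFO )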
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL CLARFB_GETT, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL CLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. 
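+*
+*     (Illustrative example, assuming M = 10, N = 2 and MB = 4:
+*     then MB2 = 2, ITMP = ( 10 - 4 - 1 ) / 2 = 2, IB_BOTTOM = 9 and
+*     NUM_ALL_ROW_BLOCKS = 4, so the bottom-up loop below visits the
+*     row blocks starting at IB = 9, 7, 5, and the top row block of
+*     size MB1 = MIN( MB, M ) = 4 is handled afterwards in step (2).)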
+*
+         MB2 = MB - N
+         M_PLUS_ONE = M + 1
+         ITMP = ( M - MB - 1 ) / MB2
+         IB_BOTTOM = ITMP * MB2 + MB + 1
+         NUM_ALL_ROW_BLOCKS = ITMP + 2
+         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
+*
+         DO IB = IB_BOTTOM, MB+1, -MB2
+*
+*           Determine the block size IMB for the current row block
+*           in the matrix A.
+*
+            IMB = MIN( M_PLUS_ONE - IB, MB2 )
+*
+*           Determine the column index JB_T for the current column block
+*           in the matrix T.
+*
+            JB_T = JB_T - N
+*
+*           Apply column blocks of H in the row block from right to left.
+*
+*           KB is the column index of the current column block reflector
+*           in the matrices T and V.
+*
+            DO KB = KB_LAST, 1, -NBLOCAL
+*
+*              Determine the size of the current column block KNB in
+*              the matrices T and V.
+*
+               KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+               CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB,
+     $                      T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
+     $                      A( IB, KB ), LDA, WORK, KNB )
+*
+            END DO
+*
+         END DO
+*
+      END IF
+*
+*     (2) Top row block of A.
+*     NOTE: If MB>=M, then we have only one row block of A of size M
+*     and we work on the entire matrix A.
+*
+      MB1 = MIN( MB, M )
+*
+*     Apply column blocks of H in the top row block from right to left.
+*
+*     KB is the column index of the current block reflector in
+*     the matrices T and V.
+*
+      DO KB = KB_LAST, 1, -NBLOCAL
+*
+*        Determine the size of the current column block KNB in
+*        the matrices T and V.
+*
+         KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+         IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+*           In CLARFB_GETT parameters, when M=0, then the matrix B
+*           does not exist, hence we need to pass a dummy array
+*           reference DUMMY(1,1) to B with LDDUMMY=1.
+*
+            CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT,  A( KB, KB ), LDA,
+     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
+         ELSE
+            CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        A( KB+KNB, KB), LDA, WORK, KNB )
+
+         END IF
+*
+      END DO
+*
+      WORK( 1 ) = CMPLX( LWORKOPT )
+      RETURN
+*
+*     End of CUNGTSQR_ROW
+*
+      END
diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f
index 0218900d2..80d18041c 100644
--- a/lapack-netlib/SRC/dgesdd.f
+++ b/lapack-netlib/SRC/dgesdd.f
@@ -267,9 +267,9 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, DISNAN
       DOUBLE PRECISION   DLAMCH, DLANGE
-      EXTERNAL           DLAMCH, DLANGE, LSAME
+      EXTERNAL           DLAMCH, DLANGE, LSAME, DISNAN
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN, SQRT
@@ -599,6 +599,10 @@
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
       ANRM = DLANGE( 'M', M, N, A, LDA, DUM )
+      IF( DISNAN( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
       ISCL = 0
       IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
         ISCL = 1
diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f
new file mode 100644
index 000000000..668deeba8
--- /dev/null
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b DGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGETSQRHRT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*      $                       LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGETSQRHRT computes an NB2-sized column blocked QR-factorization
+*> of a real M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally an NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in DGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of DGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
+     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      INFO = 0
+      LQUERY = LWORK.EQ.-1
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB1.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB1.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( NB2.LT.1 ) THEN
+         INFO = -5
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -7
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
+         INFO = -9
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store arrays:
+*        a) Matrix T and WORK for DLATSQR;
+*        b) N-by-N upper-triangular factor R_tsqr;
+*        c) Matrix T and array WORK for DORGTSQR_ROW;
+*        d) Diagonal D for DORHR_COL.
+*
+         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NB1LOCAL = MIN( NB1, N )
+*
+            NUM_ALL_ROW_BLOCKS = MAX( 1,
+     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+*           Length and leading dimension of WORK array to place
+*           T array in TSQR.
+*
+            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
+
+            LDWT = NB1LOCAL
+*
+*           Length of TSQR work array
+*
+            LW1 = NB1LOCAL * N
+*
+*           Length of DORGTSQR_ROW work array.
+*
+            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
+*
+            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -11
+            END IF
+*
+         END IF
+      END IF
+*
+*     Handle error in the input parameters and handle the workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGETSQRHRT', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+      NB2LOCAL = MIN( NB2, N )
+*
+*
+*     (1) Perform TSQR-factorization of the M-by-N matrix A.
+*
+      CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
+     $              WORK(LWT+1), LW1, IINFO )
+*
+*     (2) Copy the factor R_tsqr stored in the upper-triangular part
+*         of A into the square matrix in the work array
+*         WORK(LWT+1:LWT+N*N) column-by-column.
+*
+      DO J = 1, N
+         CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
+      END DO
+*
+*     (3) Generate an M-by-N matrix Q with orthonormal columns from
+*     the result stored below the diagonal in the array A in place.
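+*
+*     (Workspace layout at this point: WORK(1:LWT) holds the TSQR
+*     block reflectors T, WORK(LWT+1:LWT+N*N) holds the copy of
+*     R_tsqr made in step (2), and WORK(LWT+N*N+1:) is the scratch
+*     area passed to DORGTSQR_ROW below.)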
+* + + CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 61b016f16..1c277c6bb 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, HALF, ONE + DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . 
(In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . 
(NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. 
( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . 
criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE * -* ==== Vertical multiply ==== +* ==== Vertical multiply ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting 
U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ====
-*
-                     CALL DLACPY( 'ALL', JLEN, KNZ,
-     $                            Z( JROW, INCOL+1+J2 ), LDZ,
-     $                            WV( 1, 1+KZS ), LDWV )
-*
-*                    ==== Multiply by U12 ====
-*
-                     CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV,
-     $                            LDWV )
-                     CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE,
-     $                           U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ),
-     $                           LDWV )
-*
-*                    ==== Multiply by U11 ====
-*
-                     CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE,
-     $                           Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE,
-     $                           WV, LDWV )
-*
-*                    ==== Copy left of Z to right of scratch ====
-*
-                     CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ),
-     $                            LDZ, WV( 1, 1+I2 ), LDWV )
-*
-*                    ==== Multiply by U21 ====
-*
-                     CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE,
-     $                           U( 1, I2+1 ), LDU, WV( 1, 1+I2 ),
-     $                           LDWV )
-*
-*                    ==== Multiply by U22 ====
-*
-                     CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE,
-     $                           Z( JROW, INCOL+1+J2 ), LDZ,
-     $                           U( J2+1, I2+1 ), LDU, ONE,
-     $                           WV( 1, 1+I2 ), LDWV )
-*
-*                    ==== Copy the result back to Z ====
-*
-                     CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV,
-     $                            Z( JROW, INCOL+1 ), LDZ )
-  210             CONTINUE
-               END IF
            END IF
         END IF
-  220 CONTINUE
+  180 CONTINUE
*
*     ==== End of DLAQR5 ====
*
diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f
new file mode 100644
index 000000000..10ab6461e
--- /dev/null
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@@ -0,0 +1,596 @@
+*> \brief \b DLARFB_GETT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLARFB_GETT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+*     $                        WORK, LDWORK )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      CHARACTER          IDENT
+*      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
+*     $                   WORK( LDWORK, * )
+*     ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFB_GETT applies a real Householder block reflector H from the
+*> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
+*> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+*> in the array B. The block reflector H is stored in a compact
+*> WY-representation, where the elementary reflectors are in the
+*> arrays A, B and T. See Further Details section.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] IDENT
+*> \verbatim
+*>          IDENT is CHARACTER*1
+*>          If IDENT is not 'I' and not 'i', then V1 is unit
+*>             lower-triangular and stored in the left K-by-K block of
+*>             the input matrix A,
+*>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
+*>             not stored.
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix B.
+*>          M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.
+*>          N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The number of rows of the matrix A.
+*>          K is also the order of the matrix T, i.e. the number of
+*>          elementary reflectors whose product defines the block
+*>          reflector. 0 <= K <= N.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The upper-triangular K-by-K matrix T in the representation
+*>          of the block reflector.
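+*          As an illustration (a standard property of the compact WY
+*          form, not an extra interface requirement): for K = 2
+*          elementary reflectors H(i) = I - TAU(i)*v(i)*v(i)**T,
+*
+*             T = ( TAU(1)   -TAU(1)*TAU(2)*(v(1)**T)*v(2) )
+*                 (   0                  TAU(2)            ),
+*
+*          so that H(1)*H(2) = I - V*T*V**T with V = ( v(1), v(2) ).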
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry:
+*>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
+*>           b) In the columns below the diagonal: columns of V1
+*>              (ones are not stored on the diagonal).
+*>
+*>          On exit:
+*>            A is overwritten by rectangular K-by-N product H*A.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,K).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,N)
+*>
+*>          On entry:
+*>            a) In the M-by-(N-K) right block: input matrix B.
+*>            b) In the M-by-K left block: columns of V2.
+*>
+*>          On exit:
+*>            B is overwritten by rectangular M-by-N product H*B.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array,
+*>          dimension (LDWORK,max(K,N-K))
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK. LDWORK>=max(1,K).
+*>
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> (1) Description of the Algebraic Operation.
+*>
+*>     The matrix A is a K-by-N matrix composed of two column block
+*>     matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
+*>     A = ( A1, A2 ).
+*>     The matrix B is an M-by-N matrix composed of two column block
+*>     matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
+*>     B = ( B1, B2 ).
+*>
+*>     Perform the operation:
+*>
+*>        ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
+*>        ( B_out )        ( B_in )                          ( B_in )
+*>         = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
+*>               ( V2 )                              ( B_in )
+*>     On input:
+*>
+*>      a) ( A_in ) consists of two block columns:
+*>         ( B_in )
+*>
+*>         ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
+*>         ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
+*>
+*>         where the column blocks are:
+*>
+*>         (  A1_in )  is a K-by-K upper-triangular matrix stored in the
+*>                     upper triangular part of the array A(1:K,1:K).
+*>         (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
+*>
+*>         ( A2_in )  is a K-by-(N-K) rectangular matrix stored
+*>                    in the array A(1:K,K+1:N).
+*>         ( B2_in )  is an M-by-(N-K) rectangular matrix stored
+*>                    in the array B(1:M,K+1:N).
+*>
+*>      b) V = ( V1 )
+*>             ( V2 )
+*>
+*>         where:
+*>         1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored;
+*>         2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix,
+*>            stored in the lower-triangular part of the array
+*>            A(1:K,1:K) (ones are not stored),
+*>         and V2 is an M-by-K rectangular matrix stored in the array
+*>         B(1:M,1:K), (because on input B1_in is a rectangular zero
+*>                      matrix that is not stored and the space is
+*>                      used to store V2).
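+*>
+*>         As a small illustration, for K = 2, N = 4, M = 3 the input
+*>         storage (entry names here are placeholders) is
+*>
+*>            A = ( a11 a12 a13 a14 )    B = ( v31 v32 b13 b14 )
+*>                ( v21 a22 a23 a24 )        ( v41 v42 b23 b24 )
+*>                                           ( v51 v52 b33 b34 )
+*>
+*>         i.e. A1_in is the upper triangle of A(1:2,1:2), the entry
+*>         v21 below it holds V1 (the unit diagonal is not stored),
+*>         A2_in is A(1:2,3:4), V2 is B(1:3,1:2) and B2_in is
+*>         B(1:3,3:4).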
+*>
+*>      c) T is a K-by-K upper-triangular matrix stored
+*>         in the array T(1:K,1:K).
+*>
+*>     On output:
+*>
+*>      a) ( A_out ) consists of two block columns:
+*>         ( B_out )
+*>
+*>         ( A_out ) = (( A1_out ) ( A2_out ))
+*>         ( B_out )   (( B1_out ) ( B2_out )),
+*>
+*>         where the column blocks are:
+*>
+*>         ( A1_out )  is a K-by-K square matrix, or a K-by-K
+*>                     upper-triangular matrix, if V1 is an
+*>                     identity matrix. A1_out is stored in
+*>                     the array A(1:K,1:K).
+*>         ( B1_out )  is an M-by-K rectangular matrix stored
+*>                     in the array B(1:M,1:K).
+*>
+*>         ( A2_out )  is a K-by-(N-K) rectangular matrix stored
+*>                     in the array A(1:K,K+1:N).
+*>         ( B2_out )  is an M-by-(N-K) rectangular matrix stored
+*>                     in the array B(1:M,K+1:N).
+*>
+*>
+*>     The operation above can be represented as the same operation
+*>     on each block column:
+*>
+*>        ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
+*>        ( B1_out )        (     0 )                          (     0 )
+*>
+*>        ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
+*>        ( B2_out )        ( B2_in )                          ( B2_in )
+*>
+*>     If IDENT != 'I':
+*>
+*>        The computation for column block 1:
+*>
+*>        A1_out: = A1_in - V1*T*(V1**T)*A1_in
+*>
+*>        B1_out: = - V2*T*(V1**T)*A1_in
+*>
+*>        The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*>     If IDENT == 'I':
+*>
+*>        The operation for column block 1:
+*>
+*>        A1_out: = A1_in - T * A1_in
+*>
+*>        B1_out: = - V2 * T * A1_in
+*>
+*>        The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
+*>
+*> (2) Description of the Algorithmic Computation.
+*>
+*>     In the first step, we compute column block 2, i.e. A2 and B2.
+*>     Here, we need to use the K-by-(N-K) rectangular workspace
+*>     matrix W2 that is of the same size as the matrix A2.
+*>     W2 is stored in the array WORK(1:K,1:(N-K)).
+*>
+*>     In the second step, we compute column block 1, i.e. A1 and B1.
+*>     Here, we need to use the K-by-K square workspace matrix W1
+*>     that is of the same size as the matrix A1.
+*>     W1 is stored in the array WORK(1:K,1:K).
+*>
+*>     NOTE: Hence, in this routine, we need the workspace array WORK
+*>     only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
+*>     the first step and W1 from the second step.
+*>
+*>     Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
+*>     requires more computations than Case (B).
+*>
+*>     if( IDENT != 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
+*>        col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>        col1_(6) square A1: = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Case (B), when V1 is an identity matrix, i.e.
IDENT == 'I',
+*>     requires fewer computations than Case (A)
+*>
+*>     if( IDENT == 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(6) upper-triangular_of_(A1): = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Combining cases (A) and (B), this is the resulting
+*>     algorithm:
+*>
+*>     if ( N > K ) then
+*>
+*>      (First Step - column block 2)
+*>
+*>      col2_(1) W2: = A2
+*>      if( IDENT != 'I' ) then
+*>        col2_(2) W2: = (V1**T) * W2
+*>                     = (unit_lower_tr_of_(A1)**T) * W2
+*>      end if
+*>      col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>      col2_(4) W2: = T * W2
+*>      col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>      if( IDENT != 'I' ) then
+*>        col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>      end if
+*>      col2_(7) A2: = A2 - W2
+*>
+*>     else
+*>
+*>      (Second Step - column block 1)
+*>
+*>      col1_(1) W1: = A1
+*>      if( IDENT != 'I' ) then
+*>        col1_(2) W1: = (V1**T) * W1
+*>                     = (unit_lower_tr_of_(A1)**T) * W1
+*>      end if
+*>      col1_(3) W1: = T * W1
+*>      col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>      if( IDENT != 'I' ) then
+*>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>        col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*>      end if
+*>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*>     end if
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+     $                        WORK, LDWORK )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          IDENT
+      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LNOTIDENT
+      INTEGER            I, J
+*     ..
+*     .. EXTERNAL FUNCTIONS ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGEMM, DTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
+     $   RETURN
+*
+      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+*     ------------------------------------------------------------------
+*
+*     First Step. Computation of the Column Block 2:
+*
+*        ( A2 ) := H * ( A2 )
+*        ( B2 )        ( B2 )
+*
+*     ------------------------------------------------------------------
+*
+      IF( N.GT.K ) THEN
+*
+*        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+*        into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+         DO J = 1, N-K
+            CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+         END DO
+
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
+*           V1 is not an identity matrix, but unit lower-triangular
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+*
+            CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*        V2 stored in B1.
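+*        (When M = 0, V2 and B2 are empty and this product
+*        is skipped.)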
+*
+         IF( M.GT.0 ) THEN
+            CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
+     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
+         END IF
+*
+*        col2_(4) Compute W2: = T * W2,
+*        T is upper-triangular.
+*
+         CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
+     $               WORK, LDWORK )
+*
+*        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
+     $                  WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
+         END IF
+*
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(7) Compute A2: = A2 - W2 =
+*                             = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+*        column-by-column.
+*
+         DO J = 1, N-K
+            DO I = 1, K
+               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     ------------------------------------------------------------------
+*
+*     Second Step. Computation of the Column Block 1:
+*
+*        ( A1 ) := H * ( A1 )
+*        ( B1 )        (  0 )
+*
+*     ------------------------------------------------------------------
+*
+*     col1_(1) Compute W1: = A1. Copy the upper-triangular
+*     A1 = A(1:K, 1:K) into the upper-triangular
+*     W1 = WORK(1:K, 1:K) column-by-column.
+*
+      DO J = 1, K
+         CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+      END DO
+*
+*     Set the subdiagonal elements of W1 to zero column-by-column.
+*
+      DO J = 1, K - 1
+         DO I = J + 1, K
+            WORK( I, J ) = ZERO
+         END DO
+      END DO
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular with zeroes below the diagonal.
+*
+         CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
+     $               WORK, LDWORK )
+      END IF
+*
+*     col1_(3) Compute W1: = T * W1,
+*     T is upper-triangular,
+*     W1 is upper-triangular with zeroes below the diagonal.
+*
+      CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
+     $            WORK, LDWORK )
+*
+*     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+*     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+      IF( M.GT.0 ) THEN
+         CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
+     $               B, LDB )
+      END IF
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular on input with zeroes below the diagonal,
+*        and square on output.
+*
+         CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
+     $               WORK, LDWORK )
+*
+*        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+*        column-by-column. A1 is upper-triangular on input.
+*        If NOT IDENT, A1 is square on output, and W1 is square,
+*        if IDENT, A1 is upper-triangular on output,
+*        W1 is upper-triangular.
+*
+*        col1_(6)_a Compute elements of A1 below the diagonal.
+*
+         DO J = 1, K - 1
+            DO I = J + 1, K
+               A( I, J ) = - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+      DO J = 1, K
+         DO I = 1, J
+            A( I, J ) = A( I, J ) - WORK( I, J )
+         END DO
+      END DO
+*
+      RETURN
+*
+*     End of DLARFB_GETT
+*
+      END
diff --git a/lapack-netlib/SRC/dlasq2.f b/lapack-netlib/SRC/dlasq2.f
index 68d922870..27eb1f79a 100644
--- a/lapack-netlib/SRC/dlasq2.f
+++ b/lapack-netlib/SRC/dlasq2.f
@@ -184,10 +184,18 @@
*
*        2-by-2 case.
*
-         IF( Z( 2 ).LT.ZERO .OR.
Z( 3 ).LT.ZERO ) THEN
-            INFO = -2
+         IF( Z( 1 ).LT.ZERO ) THEN
+            INFO = -201
            CALL XERBLA( 'DLASQ2', 2 )
            RETURN
+         ELSE IF( Z( 2 ).LT.ZERO ) THEN
+            INFO = -202
+            CALL XERBLA( 'DLASQ2', 2 )
+            RETURN
+         ELSE IF( Z( 3 ).LT.ZERO ) THEN
+            INFO = -203
+            CALL XERBLA( 'DLASQ2', 2 )
+            RETURN
         ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN
            D = Z( 3 )
            Z( 3 ) = Z( 1 )
diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f
index cfebda5ab..6868fc38d 100644
--- a/lapack-netlib/SRC/dorgbr.f
+++ b/lapack-netlib/SRC/dorgbr.f
@@ -221,8 +221,8 @@
            CALL DORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
         ELSE
            IF( M.GT.1 ) THEN
-               CALL DORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL DORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
            END IF
         END IF
      ELSE
@@ -230,8 +230,8 @@
            CALL DORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
         ELSE
            IF( N.GT.1 ) THEN
-               CALL DORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL DORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
            END IF
         END IF
      END IF
diff --git a/lapack-netlib/SRC/dorgtsqr_row.f b/lapack-netlib/SRC/dorgtsqr_row.f
new file mode 100644
index 000000000..94f8b0120
--- /dev/null
+++ b/lapack-netlib/SRC/dorgtsqr_row.f
@@ -0,0 +1,379 @@
+*> \brief \b DORGTSQR_ROW
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DORGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+*     $                         LWORK, INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+*      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DORGTSQR_ROW generates an M-by-N real matrix Q_out with
+*> orthonormal columns from the output of DLATSQR. These N orthonormal
+*> columns are the first N columns of a product of real orthogonal
+*> matrices Q(k)_in of order M, which are returned by DLATSQR in
+*> a special format.
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of DLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from the right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in the reverse of
+*> the order in which DLATSQR generates its output blocks.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by DLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by DLATSQR to return
+*>          arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See DLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See DLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + DOUBLE PRECISION DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL DLARFB_GETT, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL DLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. 
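+*     (Illustration with assumed sizes: for N = 4 and NBLOCAL = 2,
+*     KB_LAST = ( (4-1)/2 )*2 + 1 = 3, so the loop below visits
+*     the column blocks starting at KB = 3 and KB = 1, each of
+*     width KNB = 2.)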
+*
+      DO KB = KB_LAST, 1, -NBLOCAL
+*
+*        Determine the size of the current column block KNB in
+*        the matrices T and V.
+*
+         KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+         IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+*           In DLARFB_GETT parameters, when M=0, the matrix B
+*           does not exist, hence we need to pass a dummy array
+*           reference DUMMY(1,1) to B with LDDUMMY=1.
+*
+            CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
+         ELSE
+            CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        A( KB+KNB, KB), LDA, WORK, KNB )
+
+         END IF
+*
+      END DO
+*
+      WORK( 1 ) = DBLE( LWORKOPT )
+      RETURN
+*
+*     End of DORGTSQR_ROW
+*
+      END
diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f
index 66f32b790..537bd3f4f 100644
--- a/lapack-netlib/SRC/dtgsja.f
+++ b/lapack-netlib/SRC/dtgsja.f
@@ -400,7 +400,7 @@
*     .. Parameters ..
      INTEGER            MAXIT
      PARAMETER          ( MAXIT = 40 )
-      DOUBLE PRECISION   ZERO, ONE
+      DOUBLE PRECISION   ZERO, ONE, HUGENUM
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
*     ..
*     .. Local Scalars ..
@@ -419,7 +419,8 @@
     $                   DSCAL, XERBLA
*     ..
*     .. Intrinsic Functions ..
-      INTRINSIC          ABS, MAX, MIN
+      INTRINSIC          ABS, MAX, MIN, HUGE
+      PARAMETER          ( HUGENUM = HUGE(ZERO) )
*     ..
*     .. Executable Statements ..
*
@@ -596,9 +597,9 @@
*
            A1 = A( K+I, N-L+I )
            B1 = B( I, N-L+I )
+            GAMMA = B1 / A1
*
-            IF( A1.NE.ZERO ) THEN
-               GAMMA = B1 / A1
+            IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN
*
*              change sign if necessary
*
diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f
index 689494dd1..89e03a002 100644
--- a/lapack-netlib/SRC/sgesdd.f
+++ b/lapack-netlib/SRC/sgesdd.f
@@ -267,9 +267,9 @@
     $                   XERBLA
*     ..
*     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, SISNAN
      REAL               SLAMCH, SLANGE
-      EXTERNAL           SLAMCH, SLANGE, LSAME
+      EXTERNAL           SLAMCH, SLANGE, LSAME, SISNAN
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          INT, MAX, MIN, SQRT
@@ -599,6 +599,10 @@
*     Scale A if max element outside range [SMLNUM,BIGNUM]
*
      ANRM = SLANGE( 'M', M, N, A, LDA, DUM )
+      IF( SISNAN( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
      ISCL = 0
      IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
         ISCL = 1
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
new file mode 100644
index 000000000..f9580da7b
--- /dev/null
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b SGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGETSQRHRT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*     $                       LWORK, INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+*      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGETSQRHRT computes an NB2-sized column blocked QR-factorization
+*> of a real M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally an NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
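+*>
+*> A plausible call sequence (the names M, N, MB1, NB1, NB2, A, LDA,
+*> T, LDT, WORK, LWORK, INFO are the arguments described below; the
+*> two-step pattern is the standard LAPACK workspace query) is:
+*>
+*>    CALL SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
+*>   $                 WORK, -1, INFO )
+*>    LWORK = INT( WORK( 1 ) )
+*>    (allocate WORK with LWORK entries)
+*>    CALL SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
+*>   $                 WORK, LWORK, INFO )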
+*>
+*> The output Q and R factors are stored in the same format as in SGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of SGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+ INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for SLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for SORGTSQR_ROW; +* d) Diagonal D for SORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of SORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. 
+* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e678305f2..375645b75 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, HALF, ONE + REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) REAL MULTPL diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. 
-*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. 
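+*        (When reflections are accumulated in U, rows above
+*        MAX( KTOP, INCOL ) are updated afterwards by the blocked
+*        far-from-diagonal multiplies; if a full Schur form is
+*        wanted, every row of H participates; otherwise only rows
+*        KTOP through KBOT are needed.)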
+* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * - IF( ACCUM ) THEN +* ==== Perform update from left for subsequent +* . column. ==== * -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( 
JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f new file mode 100644 index 000000000..7719f2965 --- /dev/null +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b SLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT is not 'I' or 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number of rows of the matrix A. +*> K is also the order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector.
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-K left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular matrix stored in the +*> array B(1:M,1:K), (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K).
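+*>
+*>    As a purely illustrative sketch of this storage scheme (the
+*>    example values below are hypothetical and not part of the
+*>    interface above): with K = 2, M = 2 and N = 3, the arrays A
+*>    and B jointly hold A1, A2, V1, V2 and B2 as
+*>
+*>       A(1:2,1:3) = ( a11 a12 a13 )   B(1:2,1:3) = ( w11 w12 b13 )
+*>                    ( v21 a22 a23 )                ( w21 w22 b23 ),
+*>
+*>    where v21 is the one stored element of V1 (its unit diagonal
+*>    is implicit) and the w entries are the columns of V2.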
+*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. A1_out is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,1:K). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> requires more computations than Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e.
IDENT == 'I', +*> requires fewer computations than Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combining these cases (A) and (B) gives the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, $ WORK, LDWORK ) IMPLICIT NONE * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. CHARACTER IDENT INTEGER K, LDA, LDB, LDT, LDWORK, M, N * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), $ WORK( LDWORK, * ) * .. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LNOTIDENT INTEGER I, J * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, STRMM * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) $ RETURN * LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) * * ------------------------------------------------------------------ * * First Step. Computation of the Column Block 2: * * ( A2 ) := H * ( A2 ) * ( B2 ) ( B2 ) * * ------------------------------------------------------------------ * IF( N.GT.K ) THEN * * col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) * into W2=WORK(1:K, 1:N-K) column-by-column. * DO J = 1, N-K CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) END DO IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, * V1 is not an identity matrix, but unit lower-triangular, * V1 stored in A1 (diagonal ones are not stored). * * CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, $ WORK, LDWORK ) END IF * * col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 * V2 stored in B1.
+* + IF( M.GT.0 ) THEN + CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If NOT IDENT, A1 is square on output, and W1 is square, +* if IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of SLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 6e5f86447..219797c4a 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -183,10 +183,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR.
Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'SLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index dccdbb58a..2266505dc 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -221,8 +221,8 @@ CALL SORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL SORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL SORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL SORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f new file mode 100644 index 000000000..d2a2150cd --- /dev/null +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b SORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of SLATSQR. These N orthonormal +*> columns are the first N columns of a product of real orthogonal +*> matrices Q(k)_in of order M, which are returned by SLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of SLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine SLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which SLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by SLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by SLATSQR to return +*> arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by SLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See SLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e. the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See SLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO ) IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB * .. * .. Array Arguments .. REAL A( LDA, * ), T( LDT, * ), WORK( * ) * .. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars ..
+ LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + REAL DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL SLARFB_GETT, SLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL SLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. 
+* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f new file mode 100644 index 000000000..5f0167937 --- /dev/null +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b ZGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and performs the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R.
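+*>
+*>  In outline (an informal sketch of the steps performed in the
+*>  body of this routine, summarizing the numbered comments there,
+*>  not additional interface behaviour):
+*>
+*>     1. ZLATSQR:      blocked TSQR, A = Q_tsqr * R_tsqr;
+*>     2. ZUNGTSQR_ROW: form the M-by-N orthonormal factor of
+*>                      Q_tsqr in place in A;
+*>     3. ZUNHR_COL:    reconstruct the Householder vectors and the
+*>                      block reflectors T, together with the
+*>                      diagonal sign matrix S giving R = S * R_tsqr.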
+*> +*> The output Q and R factors are stored in the same format as in ZGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of ZGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,N) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, $ LWORK, INFO ) IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * ..
Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for ZLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for ZUNGTSQR_ROW; +* d) Diagonal D for ZUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of ZUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. 
+* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b28ae47a4..960244727 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -319,7 +319,7 @@ DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, $ U12, X, ABI12, Y * .. * .. External Functions .. @@ -352,6 +352,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -365,6 +366,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -378,6 +380,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. 
NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. 
==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. 
( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . 
criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
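The vigilant deflation test in this block is the standard two-stage check: the subdiagonal entry must first be small against the ULP-scaled neighborhood, and then the Ahues-Tisseur refinement is applied to the 2-by-2 diagonal block. A condensed sketch on precomputed magnitudes (the function and argument names are illustrative; the routine itself works with CABS1 of the complex entries, and ADIFF below approximates CABS1( H(K,K)-H(K+1,K+1) ) by a magnitude difference for the sake of the sketch):

      LOGICAL FUNCTION DEFLOK( AKK, AK1K, AKK1, AK1K1, ULP, SMLNUM )
*     Two-stage deflation test on magnitudes: AKK = |H(k,k)|,
*     AK1K = |H(k+1,k)|, AKK1 = |H(k,k+1)|, AK1K1 = |H(k+1,k+1)|.
      DOUBLE PRECISION AKK, AK1K, AKK1, AK1K1, ULP, SMLNUM
      DOUBLE PRECISION TST1, TST2, SCL, H11, H12, H21, H22, ADIFF
      DEFLOK = .FALSE.
*     Traditional test: subdiagonal small versus nearby diagonals.
      TST1 = AKK + AK1K1
      IF( AK1K.LE.MAX( SMLNUM, ULP*TST1 ) ) THEN
*        Ahues and Tisseur (LAWN 122, 1997) refinement.
         ADIFF = ABS( AKK - AK1K1 )
         H12 = MAX( AK1K, AKK1 )
         H21 = MIN( AK1K, AKK1 )
         H11 = MAX( AK1K1, ADIFF )
         H22 = MIN( AK1K1, ADIFF )
         SCL = H11 + H12
         TST2 = H22*( H11 / SCL )
         DEFLOK = TST2.EQ.0.0D+0 .OR.
     $            H21*( H12 / SCL ).LE.MAX( SMLNUM, ULP*TST2 )
      END IF
      END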
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
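When ACCUM is set, the reflections are folded into the KDU-by-KDU unitary U and the far-from-diagonal part of H is touched only once per slab, with Level-3 BLAS. The pattern, as a hedged fragment (names follow the routine; WH is scratch with leading dimension LDWH):

*     Update a horizontal strip of H in place:
*     H(INCOL+K1:INCOL+K1+NU-1, JCOL:JCOL+JLEN-1) :=
*     U(K1:K1+NU-1, K1:K1+NU-1)**H * (the same strip of H).
      CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), LDU,
     $            H( INCOL+K1, JCOL ), LDH, ZERO, WH, LDWH )
      CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH,
     $             H( INCOL+K1, JCOL ), LDH )

This is the same ZGEMM/ZLACPY pair that appears in the 150-loop further down; the non-ACCUM branch trades those BLAS-3 calls for the elementwise reflector loops shown here.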
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
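Whether Z is updated directly, as in the loop removed here, or later through the accumulated U, the elementary operation is the same: a 3-element Householder reflector applied from the right. A standalone sketch of that kernel (subroutine name illustrative; V is scaled as by ZLARFG, with V(1) playing the role of tau):

      SUBROUTINE ZAPPR3( NROWS, Z, LDZ, K, V )
*     Apply the reflector to columns K+1:K+3 of Z from the right,
*     matching the update performed by the Z loops above.
      INTEGER NROWS, LDZ, K, J
      COMPLEX*16 Z( LDZ, * ), V( 3 ), REFSUM
      INTRINSIC DCONJG
      DO 10 J = 1, NROWS
         REFSUM = V( 1 )*( Z( J, K+1 ) + V( 2 )*Z( J, K+2 ) +
     $            V( 3 )*Z( J, K+3 ) )
         Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
         Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*DCONJG( V( 2 ) )
         Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*DCONJG( V( 3 ) )
   10 CONTINUE
      END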
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', 
JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f new file mode 100644 index 000000000..4a3c4dcf1 --- /dev/null +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b ZLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZGEMM, ZTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
+* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of ZLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3cdb8127d..c1c35822c 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -222,8 +222,8 @@ CALL ZUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL ZUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL ZUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL ZUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/zungtsqr_row.f b/lapack-netlib/SRC/zungtsqr_row.f new file mode 100644 index 000000000..0d32ad6ce --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b ZUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of ZLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by ZLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). 
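Stepping back to the ZTGSJA hunk above: the division guard changed so that GAMMA = B1/A1 is computed unconditionally and accepted only if it lies in [-HUGE, HUGE]. Unlike the old A1.NE.ZERO test, this also rejects Inf and NaN quotients produced by overflow. A minimal sketch of the pattern (program name and sample values are illustrative; IEEE non-stop arithmetic is assumed):

      PROGRAM FINGRD
*     Compute the quotient first, then test it for finiteness against
*     +/-HUGE, as in the rewritten ZTGSJA branch.  Under IEEE
*     arithmetic 1/0 gives Inf, the combined test fails, and the
*     fallback branch is taken instead of a spurious scaling.
      DOUBLE PRECISION A1, B1, GAMMA, HUGENUM
      PARAMETER ( HUGENUM = HUGE( 0.0D+0 ) )
      A1 = 0.0D+0
      B1 = 1.0D+0
      GAMMA = B1 / A1
      IF( ( GAMMA.LE.HUGENUM ) .AND. ( GAMMA.GE.-HUGENUM ) ) THEN
         WRITE( *, * ) 'finite quotient:', GAMMA
      ELSE
         WRITE( *, * ) 'quotient not finite, take fallback branch'
      END IF
      END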
+*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of ZLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine ZLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which ZLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by ZLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See ZLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See ZLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. 
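The LWORK = -1 query convention documented above composes naturally with ZLATSQR. A hedged end-to-end sketch (driver name, sizes and the simple test matrix are illustrative only, and error handling is minimal): factor a tall-skinny A, then form the explicit orthonormal factor with the new routine.

      PROGRAM TSQDRV
*     Illustrative driver: ZLATSQR followed by ZUNGTSQR_ROW with the
*     same MB/NB and the T produced by the factorization.  T needs
*     N*NIRB columns, NIRB = MAX( 1, CEIL((M-N)/(MB-N)) ) = 5 here.
      INTEGER M, N, MB, NB, LDA, LDT, LWMAX, LWORK, INFO, I, J
      PARAMETER ( M = 12, N = 3, MB = 5, NB = 2,
     $            LDA = M, LDT = NB, LWMAX = 1024 )
      COMPLEX*16 A( LDA, N ), T( LDT, 5*N ), WORK( LWMAX )
      EXTERNAL ZLATSQR, ZUNGTSQR_ROW
      INTRINSIC DCMPLX, DBLE, INT
*     Build a simple full-rank test matrix.
      DO J = 1, N
         DO I = 1, M
            A( I, J ) = DCMPLX( DBLE( I+J ), DBLE( I-J ) )
         END DO
         A( J, J ) = A( J, J ) + DCMPLX( DBLE( M ), 0.0D+0 )
      END DO
      CALL ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWMAX, INFO )
      IF( INFO.NE.0 ) STOP 'ZLATSQR failed'
*     Workspace query, then the explicit build of Q_out.
      CALL ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, -1,
     $                   INFO )
      LWORK = INT( DBLE( WORK( 1 ) ) )
      IF( LWORK.GT.LWMAX ) STOP 'increase LWMAX'
      CALL ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $                   INFO )
      IF( INFO.NE.0 ) STOP 'ZUNGTSQR_ROW failed'
      WRITE( *, * ) 'Q(1,1) =', A( 1, 1 )
      END

On exit A holds the M-by-N matrix Q_out whose columns are orthonormal.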
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX*16 DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL ZLARFB_GETT, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL ZLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. 
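*
*     A worked instance of this bookkeeping (sample sizes are
*     illustrative, not from the patch): for M = 10, N = 2, MB = 5
*     the formulas below give MB2 = 3, ITMP = ( 10-5-1 ) / 3 = 1,
*     IB_BOTTOM = 9, NUM_ALL_ROW_BLOCKS = 3 and an initial JB_T = 7,
*     so the bottom-up loop visits the row blocks starting at rows 9
*     and 6 before the top MB-row block is handled in step (2).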
+* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR_ROW +* + END diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index 0d3d7493c..de4aed696 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CST', NOUT ) @@ -2338,7 +2338,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CHB', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index ee22ce33d..00e8eb57f 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1876,7 +1876,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL DERRST( 'DST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index a063c18b5..c3f9ca162 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1877,7 +1877,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL SERRST( 'SST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 29604956d..908b7d651 
100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZST', NOUT ) @@ -2336,7 +2336,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZHB', NOUT ) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 0d0bb5418..309ed7e77 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -40,7 +40,7 @@ set(SLINTST schkaa.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f - schkorhr_col.f serrorhr_col.f sorhr_col01.f) + schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -96,7 +96,7 @@ set(CLINTST cchkaa.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f - cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -142,7 +142,7 @@ set(DLINTST dchkaa.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f - dchkorhr_col.f derrorhr_col.f dorhr_col01.f) + dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f - zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 6e790aa93..674265816 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -74,7 +74,7 @@ SLINTST = schkaa.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ - schkorhr_col.o serrorhr_col.o sorhr_col01.o + schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -123,7 +123,7 @@ CLINTST = cchkaa.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ - cchkunhr_col.o cerrunhr_col.o cunhr_col01.o + cchkunhr_col.o cerrunhr_col.o cunhr_col01.o cunhr_col02.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -167,7 +167,7 @@ DLINTST = dchkaa.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ - dchkorhr_col.o derrorhr_col.o dorhr_col01.o + dchkorhr_col.o 
derrorhr_col.o dorhr_col01.o dorhr_col02.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ - zchkunhr_col.o zerrunhr_col.o zunhr_col01.o + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f index 00077ddd9..0d6a9063d 100644 --- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR -*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested -*> before this test. +*> CCHKUNHR_COL tests: +*> 1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT, +*> 2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== - SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01, + $ CUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test CUNHR_COL * - CALL CUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL CUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of CCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/cdrvgex.f b/lapack-netlib/TESTING/LIN/cdrvgex.f index 51fc84899..9b075908f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgex.f +++ b/lapack-netlib/TESTING/LIN/cdrvgex.f @@ -707,9 +707,10 @@ CALL CLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL CLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL CLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL CLASET( 'Full', N, N, CMPLX( ZERO ), + $ CMPLX( ZERO ), AFAC, LDA ) + CALL CLASET( 'Full', N, NRHS, CMPLX( ZERO ), + $ CMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 32be41f64..959258e1f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -449,11 +449,11 @@ * Reconstruct matrix from factors and compute * residual. * -c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, -c $ IWORK, AINV, LDA, RWORK, -c $ RESULT( 2 ) ) -c NT = 2 - NT = 1 +c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, +c $ IWORK, AINV, LDA, RWORK, +c $ RESULT( 2 ) ) +c NT = 2 + NT = 1 * * Print information about the tests that did not pass * the threshold. diff --git a/lapack-netlib/TESTING/LIN/cdrvrfp.f b/lapack-netlib/TESTING/LIN/cdrvrfp.f index a57688f83..362a0e7cb 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/cdrvrfp.f @@ -449,19 +449,19 @@ * Form the inverse of A. * CALL CPOTRI( UPLO, N, A, LDA, INFO ) + + IF ( N .NE. 0 ) THEN * -* Compute the 1-norm condition number of A. +* Compute the 1-norm condition number of A. * - IF ( N .NE. 0 ) THEN AINVNM = CLANHE( '1', UPLO, N, A, LDA, + S_WORK_CLANHE ) RCONDC = ( ONE / ANORM ) / AINVNM * * Restore the matrix A. * - CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) + CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) END IF - * END IF * diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f index d760caba5..d77d60b1a 100644 --- a/lapack-netlib/TESTING/LIN/cunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -13,7 +13,7 @@ * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* DOUBLE PRECISION RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. -*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR +*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. 
-*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup complex16_lin +*> \ingroup complex_lin * * ===================================================================== SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,10 +131,10 @@ * * .. * .. Local allocatable arrays - COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) - REAL, ALLOCATABLE :: RWORK(:) + REAL , ALLOCATABLE :: RWORK(:) * * .. Parameters .. REAL ZERO @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, AF, M, R, M ) + CALL CLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, R, M, AF, M ) + CALL CLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/cunhr_col02.f b/lapack-netlib/TESTING/LIN/cunhr_col02.f new file mode 100644 index 000000000..001f291da --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b CUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL , ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CGETSQRHRT, + $ CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CGETSQRHRT' + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
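The residual checks that follow validate the CGETSQRHRT factorization against an explicitly formed Q. A minimal NumPy sketch of TEST 1 and TEST 2 below, with numpy.linalg.qr as a stand-in for CGETSQRHRT plus CGEMQRT; it illustrates the residual scaling only, not the LAPACK driver itself:

    import numpy as np

    rng = np.random.default_rng(1988)
    m, n = 12, 5
    eps = np.finfo(np.complex64).eps          # single-precision machine epsilon (cf. SLAMCH)

    A = (rng.standard_normal((m, n))
         + 1j * rng.standard_normal((m, n))).astype(np.complex64)

    # Explicit m-by-m Q (Q = Q_qr * I in the Fortran test) and the m-by-n R.
    Q, R = np.linalg.qr(A, mode="complete")

    def one_norm(X):                          # the '1' norm used by CLANGE
        return np.abs(X).sum(axis=0).max()

    # TEST 1: |R - (Q**H)*A| / ( eps * m * |A| )
    result1 = one_norm(R - Q.conj().T @ A) / (eps * max(1, m) * one_norm(A))

    # TEST 2: |I - (Q**H)*Q| / ( eps * m )
    result2 = one_norm(np.eye(m) - Q.conj().T @ Q) / (eps * max(1, m))

    print(result1, result2)                   # both O(1) for a healthy factorization

Both quotients stay of order one when the computed factorization is backward stable, which is exactly what the THRESH comparison in the driver enforces.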
+* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f index 3b3e421eb..0e2d44d8d 100644 --- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f +++ 
b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR -*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested -*> before this test. +*> DCHKORHR_COL tests: +*> 1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT, +*> 2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== - SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01, + $ DORHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test DORHR_COL * - CALL DORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL DORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
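The loop nest just added to the driver is a five-way parameter sweep: every (M, N, MB1, NB1, NB2) combination from the input value lists is visited, filtered by M >= N, MB1 > N, and positive NB1 and NB2. A compact sketch of the same traversal, assuming hypothetical mval/nval/nbval lists and a run_col02 callback in place of the DORHR_COL02 call:

    from itertools import product

    def sweep(mval, nval, nbval, run_col02):
        # Visit the same (M, N, MB1, NB1, NB2) tuples as the Fortran loops.
        nrun = 0
        for m, n in product(mval, nval):
            if not (min(m, n) > 0 and m >= n):      # Only for M >= N
                continue
            for mb1 in nbval:
                if mb1 <= n:                        # Only for MB1 > N
                    continue
                for nb1, nb2 in product(nbval, repeat=2):
                    if nb1 > 0 and nb2 > 0:
                        run_col02(m, n, mb1, nb1, nb2)
                        nrun += 6                   # NRUN = NRUN + NTESTS
        return nrun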
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of DCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/ddrvrfp.f b/lapack-netlib/TESTING/LIN/ddrvrfp.f index d67cf6713..18ccbdfc4 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrfp.f +++ b/lapack-netlib/TESTING/LIN/ddrvrfp.f @@ -443,8 +443,7 @@ * CALL DPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN - + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f index 3e48de37f..979255ca9 100644 --- a/lapack-netlib/TESTING/LIN/dorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. -*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup single_lin +*> \ingroup double_lin * * ===================================================================== SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col02.f b/lapack-netlib/TESTING/LIN/dorhr_col02.f new file mode 100644 index 000000000..d4c438edb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f @@ -0,0 +1,377 @@ +*> \brief \b DORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+ DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + DOUBLE PRECISION WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DLANGE, DLANSY + EXTERNAL DLAMCH, DLANGE, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLARNV, DLASET, DGETSQRHRT, + $ DSCAL, DGEMM, DGEMQRT, DSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL DLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in DLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* DGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'DGETSQRHRT' + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
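The WORKQUERY call above with LWORK = -1 is the standard LAPACK workspace query: the routine writes its optimal workspace size into WORK(1) without factoring anything, and the caller allocates and calls again. The same idiom through SciPy's low-level wrappers, shown with DGEQRF since DGETSQRHRT has no SciPy wrapper; a sketch of the convention, not of this test:

    import numpy as np
    from scipy.linalg import lapack

    a = np.asfortranarray(np.random.rand(8, 4))

    # Query pass: LWORK = -1, nothing is factored, work[0] holds the optimum.
    qr, tau, work, info = lapack.dgeqrf(a, lwork=-1)
    lwork = int(work[0])

    # Real pass with the queried workspace size.
    qr, tau, work, info = lapack.dgeqrf(a, lwork=lwork)
    assert info == 0

The extra MAX against NB2_UB*N and NB2_UB*M afterwards covers the later DGEMQRT calls, which reuse the same WORK array from both sides.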
+* +* +* Generate the m-by-m matrix Q +* + CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL DLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = DLANGE( '1', M, N, A, M, RWORK ) + RESID = DLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL DLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = DLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = DLANGE( '1', M, N, C, M, RWORK ) + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f index cf6d2d323..f61b74902 100644 --- a/lapack-netlib/TESTING/LIN/schkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f 
@@ -24,8 +24,11 @@ *> *> \verbatim *> -*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SCHKORHR_COL tests: +*> 1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT, +*> 2) SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup sigle_lin +*> \ingroup single_lin * * ===================================================================== - SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2019 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,7 +135,8 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01, + $ SORHR_COL02 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -201,8 +202,8 @@ * * Test SORHR_COL * - CALL SORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL SORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of SCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/sdrvrfp.f b/lapack-netlib/TESTING/LIN/sdrvrfp.f index 4b022bcfb..c0eb4d564 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/sdrvrfp.f @@ -443,7 +443,7 @@ * CALL SPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 
0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f index 02429041b..dcc2c1cae 100644 --- a/lapack-netlib/TESTING/LIN/sorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -8,12 +8,12 @@ * Definition: * =========== * -* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) * * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* REAL RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - REAL, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) * @@ -128,7 +157,7 @@ $ SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK * .. * .. Intrinsic Functions .. 
- INTRINSIC CEILING, MAX, MIN, REAL + INTRINSIC CEILING, REAL, MAX, MIN * .. * .. Scalars in Common .. CHARACTER(LEN=32) SRNAMT @@ -230,7 +259,7 @@ * * Compute the factor R_hr corresponding to the Householder * reconstructed Q_hr and place it in the upper triangle of AF to -* match the Q storage format in DGEQRT. R_hr = R_tsqr * S, +* match the Q storage format in SGEQRT. R_hr = R_tsqr * S, * this means changing the sign of I-th row of the matrix R_tsqr * according to sign of of I-th diagonal element DIAG(I) of the * matrix S. diff --git a/lapack-netlib/TESTING/LIN/sorhr_col02.f b/lapack-netlib/TESTING/LIN/sorhr_col02.f new file mode 100644 index 000000000..1cbe40577 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f @@ -0,0 +1,376 @@ +*> \brief \b SORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + REAL WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SLANGE, SLANSY + EXTERNAL SLAMCH, SLANGE, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLARNV, SLASET, SGETSQRHRT, + $ SSCAL, SGEMM, SGEMQRT, SSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL SLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in SLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* SGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'SGETSQRHRT' + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
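The row-block count NRB used above when sizing T1 follows from the SLATSQR blocking: with MB1 > N, the first block consumes MB1 rows and each later step consumes MB1 - N fresh rows (N rows of the intermediate R carry over), giving NRB = max(1, ceil((M - N)/(MB1 - N))). A one-line check with illustrative values:

    from math import ceil

    def num_row_blocks(m, n, mb1):
        # NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
        assert mb1 > n, "the drivers only exercise MB1 > N"
        return max(1, ceil((m - n) / (mb1 - n)))

    print(num_row_blocks(20, 3, 5))   # ceil(17 / 2) = 9 row blocks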
+* +* +* Generate the m-by-m matrix Q +* + CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL SLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = SLANGE( '1', M, N, A, M, RWORK ) + RESID = SLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL SLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = SLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = SLANGE( '1', M, N, C, M, RWORK ) + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of SORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f index ef8f8bcc4..395ea178a 100644 --- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f 
@@ -24,9 +24,12 @@ *> *> \verbatim *> -*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR -*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested -*> before this test. +*> ZCHKUNHR_COL tests: +*> 1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT, +*> 2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== - SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01, + $ ZUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test ZUNHR_COL * - CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL ZUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of ZCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/zdrvgex.f b/lapack-netlib/TESTING/LIN/zdrvgex.f index cdfa10727..1b784d31b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgex.f +++ b/lapack-netlib/TESTING/LIN/zdrvgex.f @@ -707,9 +707,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL ZLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL ZLASET( 'Full', N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvhex.f b/lapack-netlib/TESTING/LIN/zdrvhex.f index 3c0dfbfe4..527114508 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhex.f +++ b/lapack-netlib/TESTING/LIN/zdrvhex.f @@ -599,10 +599,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZHESVXX. diff --git a/lapack-netlib/TESTING/LIN/zdrvpox.f b/lapack-netlib/TESTING/LIN/zdrvpox.f index 260d8c1f2..0bc2c89d8 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpox.f +++ b/lapack-netlib/TESTING/LIN/zdrvpox.f @@ -611,10 +611,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT='F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvrfp.f b/lapack-netlib/TESTING/LIN/zdrvrfp.f index c7be7da03..b299a487b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/zdrvrfp.f @@ -450,7 +450,7 @@ * CALL ZPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsyx.f b/lapack-netlib/TESTING/LIN/zdrvsyx.f index 9431cd692..e4556f150 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsyx.f +++ b/lapack-netlib/TESTING/LIN/zdrvsyx.f @@ -605,10 +605,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZSYSVXX. 
diff --git a/lapack-netlib/TESTING/LIN/zerrvxx.f b/lapack-netlib/TESTING/LIN/zerrvxx.f index 9dc008215..bdaf44d8a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvxx.f +++ b/lapack-netlib/TESTING/LIN/zerrvxx.f @@ -1166,7 +1166,7 @@ $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, $ ERR_BNDS_C, NPARAMS, PARAMS, W, RW, INFO ) CALL CHKXER( 'ZSYSVXX', INFOT, NOUT, LERR, OK ) - INFOT = 13 + INFOT = 13 EQ = 'N' CALL ZSYSVXX( 'N', 'U', 2, 0, A, 2, AF, 2, IP, EQ, R, B, 1, X, $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f index 9fb3bf352..b7590a8ea 100644 --- a/lapack-netlib/TESTING/LIN/zunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR. -*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR +*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. 
Local allocatable arrays - COMPLEX*16, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, AF, M, R, M ) + CALL ZLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, R, M, AF, M ) + CALL ZLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/zunhr_col02.f b/lapack-netlib/TESTING/LIN/zunhr_col02.f new file mode 100644 index 000000000..c6e7f80cd --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b ZUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of rows in a row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in a column block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in a column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is an m-by-n test input matrix to be factored, +*> so that A = Q_qr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is an n-by-n upper-triangular matrix, +*> +*> 0 is an (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_qr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_qr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_qr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ.
of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX*16 WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY + EXTERNAL DLAMCH, ZLANGE, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT, + $ ZSCAL, ZGEMM, ZGEMQRT, ZHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL ZLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in ZLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* ZGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'ZGETSQRHRT' + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
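The pair of ZGETSQRHRT calls above follows the standard LAPACK workspace-query convention: a first call with LWORK = -1 performs no factorization and only reports the optimal workspace size in element 1 of the WORK argument, after which the caller allocates WORK and repeats the call. A minimal free-form sketch of that idiom with the same argument layout as the test code (the wrapper name factor_with_query is hypothetical; ZGETSQRHRT itself comes from the bundled Reference-LAPACK sources):

      subroutine factor_with_query( m, n, mb1, nb1, nb2, af, t2, info )
         implicit none
         integer,    intent(in)    :: m, n, mb1, nb1, nb2
         complex*16, intent(inout) :: af( m, n ), t2( nb2, n )
         integer,    intent(out)   :: info
         complex*16                :: wquery( 1 )
         complex*16, allocatable   :: work( : )
         integer                   :: lwork
         external                  :: zgetsqrhrt
         ! Query call: LWORK = -1 requests only the optimal workspace
         ! size, returned in WQUERY(1); nothing is factored yet.
         call zgetsqrhrt( m, n, mb1, nb1, nb2, af, m, t2, nb2, &
                          wquery, -1, info )
         if( info.ne.0 ) return
         lwork = int( wquery( 1 ) )
         allocate ( work( lwork ) )
         ! Second call performs the actual blocked QR factorization
         ! with Householder reconstruction.
         call zgetsqrhrt( m, n, mb1, nb1, nb2, af, m, t2, nb2, &
                          work, lwork, info )
         deallocate ( work )
      end subroutine factor_with_query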
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**H)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**H)*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*(Q**H) = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**H)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL02 +* + END diff --git a/param.h b/param.h index c41f75ec9..a35ce69bd 100644 @@ -2466,13 +2466,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 384 -#define CGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 +#define CGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096
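The param.h hunk above retunes GEMM blocking factors: in OpenBLAS the P and Q defaults size the panels that the GEMM driver packs so that a P-by-Q block of A stays cache-resident while it is reused, with Q blocking the K dimension; dropping CGEMM/ZGEMM_DEFAULT_Q from 1026 to 384 therefore shortens the complex K-panels. A much-simplified free-form sketch of what such blocking means (the routine name blocked_zgemm_sketch and the literal block sizes are illustrative only; the real kernels add packing buffers, an R-sized partition of B, and hand-tuned microkernels):

      subroutine blocked_zgemm_sketch( m, n, k, a, b, c )
         implicit none
         ! Block sizes playing the role of ZGEMM_DEFAULT_P / _Q above.
         integer, parameter        :: p = 256, q = 384
         integer,    intent(in)    :: m, n, k
         complex*16, intent(in)    :: a( m, k ), b( k, n )
         complex*16, intent(inout) :: c( m, n )
         integer :: i, j, l, i0, l0
         ! Computes C := C + A*B one p-by-q panel of A at a time, so
         ! the panel can stay cache-resident while it is reused for
         ! every column of B.
         do l0 = 1, k, q
            do i0 = 1, m, p
               do j = 1, n
                  do l = l0, min( l0 + q - 1, k )
                     do i = i0, min( i0 + p - 1, m )
                        c( i, j ) = c( i, j ) + a( i, l ) * b( l, j )
                     end do
                  end do
               end do
            end do
         end do
      end subroutine blocked_zgemm_sketch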