Merge pull request #3214 from martin-frbg/lapack-3.9.1hrt

Add new Householder Reconstruction functions from LAPACK 3.9.1
2021-05-02 23:40:03 +02:00 · 2021-05-02 23:40:03 +02:00 · e72420e8c5
parent b8232c9054 d00709e016
commit e72420e8c5
48 changed files with 9053 additions and 158 deletions
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@ -66,7 +66,7 @@ set(SLASRC
   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
   slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
   slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
-   slarf.f  slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
+   slarf.f  slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
   slarrv.f slartv.f
   slarz.f  slarzb.f slarzt.f slasy2.f
   slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@ -112,14 +112,14 @@ set(SLASRC
   sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
   stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
   sgelqt.f sgelqt3.f sgemlqt.f
-   sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
+   sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
   sgelq.f slaswlq.f slamswlq.f sgemlq.f
   stplqt.f stplqt2.f stpmlqt.f
   ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
   ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
   sgesvdq.f slaorhr_col_getrfnp.f
-   slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
+   slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
   sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -171,7 +171,7 @@ set(CLASRC
   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
   claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
   claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
-   clarf.f  clarfb.f clarfg.f clarfgp.f clarft.f
+   clarf.f  clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
   clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
   clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f  classq.f
   clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@ -209,14 +209,14 @@ set(CLASRC
   cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
   ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
   cgelqt.f cgelqt3.f cgemlqt.f
-   cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
+   cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
   cgelq.f claswlq.f clamswlq.f cgemlq.f
   ctplqt.f ctplqt2.f ctpmlqt.f
   chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
   cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
   cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
-   cungtsqr.f cunhr_col.f )
+   cungtsqr.f cungtsqr_row.f cunhr_col.f )
 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
   cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -253,7 +253,7 @@ set(DLASRC
   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
   dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
   dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
-   dlarf.f  dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
+   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
   dlargv.f dlarrv.f dlartv.f
   dlarz.f  dlarzb.f dlarzt.f dlasy2.f
   dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
@ -300,14 +300,14 @@ set(DLASRC
   dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
   dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
   dgelqt.f dgelqt3.f dgemlqt.f
-   dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
+   dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
   dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
   dtplqt.f dtplqt2.f dtpmlqt.f
   dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
   dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
   dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
-   dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
+   dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
   dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -360,7 +360,7 @@ set(ZLASRC
   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
   zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
   zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
-   zlarcm.f zlarf.f  zlarfb.f
+   zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f
   zlarfg.f zlarfgp.f zlarft.f
   zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
   zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
@ -402,13 +402,13 @@ set(ZLASRC
   ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
   ztplqt.f ztplqt2.f ztpmlqt.f
   zgelqt.f zgelqt3.f zgemlqt.f
-   zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
+   zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
   zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
   zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
   zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
   zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
-   zungtsqr.f zunhr_col.f)
+   zungtsqr.f zungtsqr_row.f zunhr_col.f)
 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
   zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@ -114,6 +114,8 @@ set(CSRC
  lapacke_cgetrs_work.c
  lapacke_cgetsls.c
  lapacke_cgetsls_work.c
  lapacke_cgetsqrhrt.c
  lapacke_cgetsqrhrt_work.c 
  lapacke_cggbak.c
  lapacke_cggbak_work.c
  lapacke_cggbal.c
@ -590,6 +592,8 @@ set(CSRC
  lapacke_cungrq_work.c
  lapacke_cungtr.c
  lapacke_cungtr_work.c
  lapacke_cungtsqr_row.c
  lapacke_cungtsqr_row_work.c
  lapacke_cunmbr.c
  lapacke_cunmbr_work.c
  lapacke_cunmhr.c
@ -735,6 +739,8 @@ set(DSRC
  lapacke_dgetrs_work.c
  lapacke_dgetsls.c
  lapacke_dgetsls_work.c
  lapacke_dgetsqrhrt.c
  lapacke_dgetsqrhrt_work.c
  lapacke_dggbak.c
  lapacke_dggbak_work.c
  lapacke_dggbal.c
@ -862,6 +868,8 @@ set(DSRC
  lapacke_dorgrq_work.c
  lapacke_dorgtr.c
  lapacke_dorgtr_work.c
  lapacke_dorgtsqr_row.c
  lapacke_dorgtsqr_row_work.c
  lapacke_dormbr.c
  lapacke_dormbr_work.c
  lapacke_dormhr.c
@ -1309,6 +1317,8 @@ set(SSRC
  lapacke_sgetrs_work.c
  lapacke_sgetsls.c
  lapacke_sgetsls_work.c
  lapacke_sgetsqrhrt.c
  lapacke_sgetsqrhrt_work.c
  lapacke_sggbak.c
  lapacke_sggbak_work.c
  lapacke_sggbal.c
@ -1435,6 +1445,8 @@ set(SSRC
  lapacke_sorgrq_work.c
  lapacke_sorgtr.c
  lapacke_sorgtr_work.c
  lapacke_sorgtsqr_row.c
  lapacke_sorgtsqr_row_work.c
  lapacke_sormbr.c
  lapacke_sormbr_work.c
  lapacke_sormhr.c
@ -1877,6 +1889,8 @@ set(ZSRC
  lapacke_zgetrs_work.c
  lapacke_zgetsls.c
  lapacke_zgetsls_work.c
  lapacke_zgetsqrhrt.c
  lapacke_zgetsqrhrt_work.c
  lapacke_zggbak.c
  lapacke_zggbak_work.c
  lapacke_zggbal.c
@ -2351,6 +2365,8 @@ set(ZSRC
  lapacke_zungrq_work.c
  lapacke_zungtr.c
  lapacke_zungtr_work.c
  lapacke_zungtsqr_row.c
  lapacke_zungtsqr_row_work.c
  lapacke_zunmbr.c
  lapacke_zunmbr_work.c
  lapacke_zunmhr.c
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@ -2941,6 +2941,42 @@ void LAPACK_zgetsls(
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT)
 void LAPACK_cgetsqrhrt(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
    lapack_complex_float* A, lapack_int const* lda,
    lapack_complex_float* T, lapack_int const* ldt,
    lapack_complex_float* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT)
 void LAPACK_dgetsqrhrt(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
    double* A, lapack_int const* lda,
    double* T, lapack_int const* ldt,
    double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT)
 void LAPACK_sgetsqrhrt(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
    float* A, lapack_int const* lda,
    float* T, lapack_int const* ldt,
    float* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT)
 void LAPACK_zgetsqrhrt(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
    lapack_complex_double* A, lapack_int const* lda,
    lapack_complex_double* T, lapack_int const* ldt,
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
 void LAPACK_cggbak(
    char const* job, char const* side,
@ -7251,6 +7287,24 @@ void LAPACK_sorgtr(
    float* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW)
 void LAPACK_dorgtsqr_row(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb, lapack_int const* nb,
    double* A, lapack_int const* lda,
    double const* T, lapack_int const* ldt,
    double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW)
 void LAPACK_sorgtsqr_row(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb, lapack_int const* nb,
    float* A, lapack_int const* lda,
    float const* T, lapack_int const* ldt,
    float* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr(
    char const* vect, char const* side, char const* trans,
@ -13540,6 +13594,24 @@ void LAPACK_zungtr(
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW)
 void LAPACK_cungtsqr_row(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb, lapack_int const* nb,
    lapack_complex_float* A, lapack_int const* lda,
    lapack_complex_float const* T, lapack_int const* ldt,
    lapack_complex_float* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW)
 void LAPACK_zungtsqr_row(
    lapack_int const* m, lapack_int const* n,
    lapack_int const* mb, lapack_int const* nb,
    lapack_complex_double* A, lapack_int const* lda,
    lapack_complex_double const* T, lapack_int const* ldt,
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );
 #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr(
    char const* vect, char const* side, char const* trans,
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a,
 lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a,
                           lapack_int lda, const double* tau );
 lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 float* a, lapack_int lda,
                                 const float* t, lapack_int ldt );
 lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 double* a, lapack_int lda,
                                 const double* t, lapack_int ldt );
 lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans,
                           lapack_int m, lapack_int n, lapack_int k,
                           const float* a, lapack_int lda, const float* tau,
@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n,
                           lapack_complex_double* a, lapack_int lda,
                           const lapack_complex_double* tau );
 lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 lapack_complex_float* a, lapack_int lda,
                                 const lapack_complex_float* t, lapack_int ldt );
 lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 lapack_complex_double* a, lapack_int lda,
                                 const lapack_complex_double* t, lapack_int ldt );
 lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans,
                           lapack_int m, lapack_int n, lapack_int k,
                           const lapack_complex_float* a, lapack_int lda,
@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n,
                                double* a, lapack_int lda, const double* tau,
                                double* work, lapack_int lwork );
 lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout,
                                      lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      float* a, lapack_int lda,
                                      const float* t, lapack_int ldt,
                                      float* work, lapack_int lwork );
 lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout,
                                      lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      double* a, lapack_int lda,
                                      const double* t, lapack_int ldt,
                                      double* work, lapack_int lwork );
 lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side,
                                char trans, lapack_int m, lapack_int n,
                                lapack_int k, const float* a, lapack_int lda,
@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n,
                                const lapack_complex_double* tau,
                                lapack_complex_double* work, lapack_int lwork );
 lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout,
                                      lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      lapack_complex_float* a, lapack_int lda,
                                      const lapack_complex_float* t, lapack_int ldt,
                                      lapack_complex_float* work, lapack_int lwork );
 lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout,
                                      lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      lapack_complex_double* a, lapack_int lda,
                                      const lapack_complex_double* t, lapack_int ldt,
                                      lapack_complex_double* work, lapack_int lwork );
 lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side,
                                char trans, lapack_int m, lapack_int n,
                                lapack_int k, const lapack_complex_float* a,
@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m,
                                 lapack_complex_double* b, lapack_int ldb,
                                 lapack_complex_double* work, lapack_int lwork );
 lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               float* a, lapack_int lda,
                               float* t, lapack_int ldt );
 lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               double* a, lapack_int lda,
                               double* t, lapack_int ldt );
 lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               lapack_complex_float* a, lapack_int lda,
                               lapack_complex_float* t, lapack_int ldt );
 lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               lapack_complex_double* a, lapack_int lda,
                               lapack_complex_double* t, lapack_int ldt );
 lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    float* a, lapack_int lda,
                                    float* t, lapack_int ldt,
                                    float* work, lapack_int lwork );
 lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    double* a, lapack_int lda,
                                    double* t, lapack_int ldt,
                                    double* work, lapack_int lwork );
 lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    lapack_complex_float* a, lapack_int lda,
                                    lapack_complex_float* t, lapack_int ldt,
                                    lapack_complex_float* work, lapack_int lwork );
 lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    lapack_complex_double* a, lapack_int lda,
                                    lapack_complex_double* t, lapack_int ldt,
                                    lapack_complex_double* work, lapack_int lwork );
 lapack_int LAPACKE_ssyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
                          float* a, lapack_int lda, float* w );
 lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@ -162,6 +162,8 @@ lapacke_cgetrs.o \
 lapacke_cgetrs_work.o \
 lapacke_cgetsls.o \
 lapacke_cgetsls_work.o \
 lapacke_cgetsqrhrt.o \
 lapacke_cgetsqrhrt_work.o \
 lapacke_cggbak.o \
 lapacke_cggbak_work.o \
 lapacke_cggbal.o \
@ -634,6 +636,8 @@ lapacke_cungrq.o \
 lapacke_cungrq_work.o \
 lapacke_cungtr.o \
 lapacke_cungtr_work.o \
 lapacke_cungtsqr_row.o \
 lapacke_cungtsqr_row_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@ -778,6 +782,8 @@ lapacke_dgetrs.o \
 lapacke_dgetrs_work.o \
 lapacke_dgetsls.o \
 lapacke_dgetsls_work.o \
 lapacke_dgetsqrhrt.o \
 lapacke_dgetsqrhrt_work.o \
 lapacke_dggbak.o \
 lapacke_dggbak_work.o \
 lapacke_dggbal.o \
@ -900,6 +906,8 @@ lapacke_dorgrq.o \
 lapacke_dorgrq_work.o \
 lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
 lapacke_dorgtsqr_row.o \
 lapacke_dorgtsqr_row_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \
 lapacke_sgetrs_work.o \
 lapacke_sgetsls.o \
 lapacke_sgetsls_work.o \
 lapacke_sgetsqrhrt.o \
 lapacke_sgetsqrhrt_work.o \
 lapacke_sggbak.o \
 lapacke_sggbak_work.o \
 lapacke_sggbal.o \
@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \
 lapacke_sorgrq_work.o \
 lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
 lapacke_sorgtsqr_row.o \
 lapacke_sorgtsqr_row_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \
 lapacke_zgetrs_work.o \
 lapacke_zgetsls.o \
 lapacke_zgetsls_work.o \
 lapacke_zgetsqrhrt.o \
 lapacke_zgetsqrhrt_work.o \
 lapacke_zggbak.o \
 lapacke_zggbak_work.o \
 lapacke_zggbal.o \
@ -2380,6 +2394,8 @@ lapacke_zungrq.o \
 lapacke_zungrq_work.o \
 lapacke_zungtr.o \
 lapacke_zungtr_work.o \
 lapacke_zungtsqr_row.o \
 lapacke_zungtsqr_row_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \
--- a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
@ -0,0 +1,80 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function cgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               lapack_complex_float* a, lapack_int lda,
                               lapack_complex_float* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    lapack_complex_float* work = NULL;
    lapack_complex_float work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -7;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
    work = (lapack_complex_float*)
        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
@ -0,0 +1,108 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function cgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    lapack_complex_float* a, lapack_int lda,
                                    lapack_complex_float* t, lapack_int ldt,
                                    lapack_complex_float* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        lapack_complex_float* a_t = NULL;
        lapack_int ldt_t = MAX(1,nb2);
        lapack_complex_float* t_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
            return info;
        }
        if( ldt < n ) {
            info = -10;
            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
                               work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (lapack_complex_float*)
            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (lapack_complex_float*)
            LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c
@ -0,0 +1,83 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function cungtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 lapack_complex_float* a, lapack_int lda,
                                 const lapack_complex_float* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    lapack_complex_float* work = NULL;
    lapack_complex_float work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -6;
        }
        if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
            return -8;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
    work = (lapack_complex_float*)
        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c
@ -0,0 +1,109 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function cungtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      lapack_complex_float* a, lapack_int lda,
                                      const lapack_complex_float* t, lapack_int ldt,
                                      lapack_complex_float* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if (matrix_layout == LAPACK_COL_MAJOR) {
        /* Call LAPACK function and adjust info */
        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
                             work, &lwork, &info);
        if (info < 0) {
            info = info - 1;
        }
    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
        lapack_int lda_t = MAX(1,m);
        lapack_complex_float* a_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -7;
            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
            return info;
        }
        lapack_int ldt_t = MAX(1,nb);
        lapack_complex_float* t_t = NULL;
        /* Check leading dimension(s) */
        if( ldt < n ) {
            info = -9;
            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
                                 work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (lapack_complex_float*)
            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (lapack_complex_float*)
            LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        LAPACKE_cge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
        /* Call LAPACK function and adjust info */
        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
                             work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
@ -0,0 +1,79 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function dgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               double* a, lapack_int lda,
                               double* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    double* work = NULL;
    double work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -7;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = (lapack_int)work_query;
    /* Allocate memory for work arrays */
    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c
@ -0,0 +1,106 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function dgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    double* a, lapack_int lda,
                                    double* t, lapack_int ldt,
                                    double* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        double* a_t = NULL;
        lapack_int ldt_t = MAX(1,nb2);
        double* t_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
            return info;
        }
        if( ldt < n ) {
            info = -10;
            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
                               work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c
@ -0,0 +1,82 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function dorgtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 double* a, lapack_int lda,
                                 const double* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    double* work = NULL;
    double work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -6;
        }
        if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
            return -8;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = (lapack_int)work_query;
    /* Allocate memory for work arrays */
    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c
@ -0,0 +1,108 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function dorgtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      double* a, lapack_int lda,
                                      const double* t, lapack_int ldt,
                                      double* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if (matrix_layout == LAPACK_COL_MAJOR) {
        /* Call LAPACK function and adjust info */
        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
                             work, &lwork, &info);
        if (info < 0) {
            info = info - 1;
        }
    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
        lapack_int lda_t = MAX(1,m);
        double* a_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -7;
            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
            return info;
        }
        lapack_int ldt_t = MAX(1,nb);
        double* t_t = NULL;
        /* Check leading dimension(s) */
        if( ldt < n ) {
            info = -9;
            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
                                 work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        LAPACKE_dge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
        /* Call LAPACK function and adjust info */
        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
                             work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c
@ -0,0 +1,79 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function sgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               float* a, lapack_int lda,
                               float* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    float* work = NULL;
    float work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -7;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = (lapack_int)work_query;
        /* Allocate memory for work arrays */
    work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c
@ -0,0 +1,106 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function sgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    float* a, lapack_int lda,
                                    float* t, lapack_int ldt,
                                    float* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        float* a_t = NULL;
        lapack_int ldt_t = MAX(1,nb2);
        float* t_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info );
            return info;
        }
        if( ldt < n ) {
            info = -10;
            LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
                               work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c
@ -0,0 +1,82 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function sorgtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 float* a, lapack_int lda,
                                 const float* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    float* work = NULL;
    float work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -6;
        }
        if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
            return -8;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = (lapack_int)work_query;
    /* Allocate memory for work arrays */
    work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c
@ -0,0 +1,108 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function sorgtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      float* a, lapack_int lda,
                                      const float* t, lapack_int ldt,
                                      float* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if (matrix_layout == LAPACK_COL_MAJOR) {
        /* Call LAPACK function and adjust info */
        LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
                             work, &lwork, &info);
        if (info < 0) {
            info = info - 1;
        }
    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
        lapack_int lda_t = MAX(1,m);
        float* a_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -7;
            LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
            return info;
        }
        lapack_int ldt_t = MAX(1,nb);
        float* t_t = NULL;
        /* Check leading dimension(s) */
        if( ldt < n ) {
            info = -9;
            LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
                                 work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        LAPACKE_sge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
        /* Call LAPACK function and adjust info */
        LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
                             work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c
@ -0,0 +1,80 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function zgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
                               lapack_complex_double* a, lapack_int lda,
                               lapack_complex_double* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    lapack_complex_double* work = NULL;
    lapack_complex_double work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -7;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = LAPACK_Z2INT( work_query );
    /* Allocate memory for work arrays */
    work = (lapack_complex_double*)
        LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
                                    a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c
@ -0,0 +1,108 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function zgetsqrhrt
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
                                    lapack_complex_double* a, lapack_int lda,
                                    lapack_complex_double* t, lapack_int ldt,
                                    lapack_complex_double* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        lapack_complex_double* a_t = NULL;
        lapack_int ldt_t = MAX(1,nb2);
        lapack_complex_double* t_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info );
            return info;
        }
        if( ldt < n ) {
            info = -10;
            LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
                               work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (lapack_complex_double*)
            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (lapack_complex_double*)
            LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
                           work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c
@ -0,0 +1,83 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native high-level C interface to LAPACK function zungtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
                                 lapack_int mb, lapack_int nb,
                                 lapack_complex_double* a, lapack_int lda,
                                 const lapack_complex_double* t, lapack_int ldt )
 {
    lapack_int info = 0;
    lapack_int lwork = -1;
    lapack_complex_double* work = NULL;
    lapack_complex_double work_query;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 );
        return -1;
    }
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -6;
        }
        if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
            return -8;
        }
    }
 #endif
    /* Query optimal working array(s) size */
    info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, &work_query, lwork );
    if( info != 0 ) {
        goto exit_level_0;
    }
    lwork = LAPACK_Z2INT( work_query );
    /* Allocate memory for work arrays */
    work = (lapack_complex_double*)
        LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
    if( work == NULL ) {
        info = LAPACK_WORK_MEMORY_ERROR;
        goto exit_level_0;
    }
    /* Call middle-level interface */
    info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb,
                                      a, lda, t, ldt, work, lwork );
    /* Release memory and exit */
    LAPACKE_free( work );
 exit_level_0:
    if( info == LAPACK_WORK_MEMORY_ERROR ) {
        LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info );
    }
    return info;
 }
--- a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c
@ -0,0 +1,109 @@
 /*****************************************************************************
  Copyright (c) 2020, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************
 * Contents: Native middle-level C interface to LAPACK function zungtsqr_row
 * Author: Intel Corporation
 *****************************************************************************/
 #include "lapacke_utils.h"
 lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
                                      lapack_int mb, lapack_int nb,
                                      lapack_complex_double* a, lapack_int lda,
                                      const lapack_complex_double* t, lapack_int ldt,
                                      lapack_complex_double* work, lapack_int lwork )
 {
    lapack_int info = 0;
    if (matrix_layout == LAPACK_COL_MAJOR) {
        /* Call LAPACK function and adjust info */
        LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
                             work, &lwork, &info);
        if (info < 0) {
            info = info - 1;
        }
    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
        lapack_int lda_t = MAX(1,m);
        lapack_complex_double* a_t = NULL;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -7;
            LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
            return info;
        }
        lapack_int ldt_t = MAX(1,nb);
        lapack_complex_double* t_t = NULL;
        /* Check leading dimension(s) */
        if( ldt < n ) {
            info = -9;
            LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
            return info;
        }
        /* Query optimal working array(s) size if requested */
        if( lwork == -1 ) {
            LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
                                 work, &lwork, &info );
            return (info < 0) ? (info - 1) : info;
        }
        /* Allocate memory for temporary array(s) */
        a_t = (lapack_complex_double*)
            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
        if( a_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_0;
        }
        t_t = (lapack_complex_double*)
            LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
        if( t_t == NULL ) {
            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
            goto exit_level_1;
        }
        /* Transpose input matrices */
        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
        LAPACKE_zge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
        /* Call LAPACK function and adjust info */
        LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
                             work, &lwork, &info );
        if( info < 0 ) {
            info = info - 1;
        }
        /* Transpose output matrices */
        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( t_t );
 exit_level_1:
        LAPACKE_free( a_t );
 exit_level_0:
        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
        }
    } else {
        info = -1;
        LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
    }
    return info;
 }
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@ -135,14 +135,14 @@ SLASRC_O = \
   slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
   slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
   slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o  slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
+   slarf.o  slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
   slarrv.o slartv.o \
   slarz.o  slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
   slasyf_rk.o \
   slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \
   slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
   sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
-   sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \
+   sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \
   sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
   sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
   spbstf.o spbsv.o  spbsvx.o \
@ -181,7 +181,7 @@ SLASRC_O = \
   sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \
   stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \
   sgelqt.o sgelqt3.o sgemlqt.o \
-   sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
+   sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
   sgelq.o slaswlq.o slamswlq.o sgemlq.o \
   stplqt.o stplqt2.o stpmlqt.o \
   sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \
@ -250,7 +250,7 @@ CLASRC_O = \
   claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
   claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
   claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
-   clarf.o  clarfb.o clarfg.o clarft.o clarfgp.o \
+   clarf.o  clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
   clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
   clarz.o  clarzb.o clarzt.o clascl.o claset.o clasr.o  classq.o \
   claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@ -278,7 +278,7 @@ CLASRC_O = \
   ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
   ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
   cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
-   cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
+   cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
   cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \
   cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \
   chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \
@ -289,7 +289,7 @@ CLASRC_O = \
   cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \
   ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \
   cgelqt.o cgelqt3.o cgemlqt.o \
-   cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
+   cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
   cgelq.o claswlq.o clamswlq.o cgemlq.o \
   ctplqt.o ctplqt2.o ctpmlqt.o \
   cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \
@ -342,14 +342,14 @@ DLASRC_O = \
   dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
   dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
   dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
-   dlarf.o  dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
+   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
   dlargv.o dlarrv.o dlartv.o \
   dlarz.o  dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
   dlasyf.o dlasyf_rook.o dlasyf_rk.o \
   dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \
   dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \
   dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \
-   dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \
+   dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \
   dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \
   dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \
   dpbstf.o dpbsv.o  dpbsvx.o \
@ -389,7 +389,7 @@ DLASRC_O = \
   dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \
   dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \
   dgelqt.o dgelqt3.o dgemlqt.o \
-   dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
+   dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
   dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \
   dtplqt.o dtplqt2.o dtpmlqt.o \
   dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \
@ -455,7 +455,7 @@ ZLASRC_O = \
   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \
   zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
   zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
-   zlarcm.o zlarf.o  zlarfb.o \
+   zlarcm.o zlarf.o  zlarfb.o zlarfb_gett.o \
   zlarfg.o zlarft.o zlarfgp.o \
   zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
   zlarz.o  zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
@ -484,7 +484,7 @@ ZLASRC_O = \
   ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \
   ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \
   zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \
-   zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
+   zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
   zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \
   zunmtr.o zupgtr.o \
   zupmtr.o izmax1.o dzsum1.o zstemr.o \
@ -498,7 +498,7 @@ ZLASRC_O = \
   ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \
   ztplqt.o ztplqt2.o ztpmlqt.o \
   zgelqt.o zgelqt3.o zgemlqt.o \
-   zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
+   zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
   zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \
   zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \
   zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
--- a/lapack-netlib/SRC/cgetsqrhrt.f
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@ -0,0 +1,349 @@
 *> \brief \b CGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in CGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of CGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is COMPLEX array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup comlpexOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, REAL, CMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for CLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for CUNGTSQR_ROW;
 *        d) Diagonal D for CUNHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of CUNGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *
      CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
            DO J = I, N
               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = CMPLX( LWORKOPT )
      RETURN
 *
 *     End of CGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/clarfb_gett.f
+++ b/lapack-netlib/SRC/clarfb_gett.f
@ -0,0 +1,597 @@
 *> \brief \b CLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *>
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CLARFB_GETT applies a complex Householder block reflector H from the
 *> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is COMPLEX array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**H)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**H)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*A1_in
 *>
 *>       B1_out: = - V2*T*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**H) * W2
 *>                      = (unit_lower_tr_of_(A1)**H) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**H) * W1
 *>                    = (unit_lower_tr_of_(A1)**H) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CCOPY, CGEMM, CTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO
         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
     $                   WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = CZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of CLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/cungtsqr_row.f
+++ b/lapack-netlib/SRC/cungtsqr_row.f
@ -0,0 +1,380 @@
 *> \brief \b CUNGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CUNGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *>
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
 *> orthonormal columns from the output of CLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by CLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of CLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine CLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which CLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by CLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by CLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by CLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See CLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See CLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      COMPLEX            DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CLARFB_GETT, CLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CUNGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL CLASET('U', M, N, CZERO, CONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )
         END IF
 *
      END DO
 *
      WORK( 1 ) = CMPLX( LWORKOPT )
      RETURN
 *
 *     End of CUNGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/dgetsqrhrt.f
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@ -0,0 +1,349 @@
 *> \brief \b DGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a real M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in DGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of DGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE
      PARAMETER          ( ONE = 1.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for DLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for DORGTSQR_ROW;
 *        d) Diagonal D for DORHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of DORGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *
      CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN
            DO J = I, N
               A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = DBLE( LWORKOPT )
      RETURN
 *
 *     End of DGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/dlarfb_gett.f
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@ -0,0 +1,596 @@
 *> \brief \b DLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DLARFB_GETT applies a real Householder block reflector H from the
 *> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is DOUBLE PRECISION array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**T)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**T)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T**A1_in
 *>
 *>       B1_out: = - V2*T**A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**T) * W2
 *>                      = (unit_lower_tr_of_(A1)**T) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**T) * W1
 *>                    = (unit_lower_tr_of_(A1)**T) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DCOPY, DGEMM, DTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO
         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
     $                   WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = ZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of DLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/dorgtsqr_row.f
+++ b/lapack-netlib/SRC/dorgtsqr_row.f
@ -0,0 +1,379 @@
 *> \brief \b DORGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DORGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DORGTSQR_ROW generates an M-by-N real matrix Q_out with
 *> orthonormal columns from the output of DLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by DLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of DLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine DLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which DLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by DLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by DLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by DLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See DLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See DLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      DOUBLE PRECISION   DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DLARFB_GETT, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          DBLE, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DORGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL DLASET('U', M, N, ZERO, ONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )
         END IF
 *
      END DO
 *
      WORK( 1 ) = DBLE( LWORKOPT )
      RETURN
 *
 *     End of DORGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/sgetsqrhrt.f
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@ -0,0 +1,349 @@
 *> \brief \b SGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in SGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of SGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is REAL array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup singleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE
      PARAMETER          ( ONE = 1.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for SLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for SORGTSQR_ROW;
 *        d) Diagonal D for SORHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of SORGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *
      CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN
            DO J = I, N
               A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = REAL( LWORKOPT )
      RETURN
 *
 *     End of SGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/slarfb_gett.f
+++ b/lapack-netlib/SRC/slarfb_gett.f
@ -0,0 +1,596 @@
 *> \brief \b SLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       REAL               A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SLARFB_GETT applies a real Householder block reflector H from the
 *> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is REAL array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is REAL array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup singleOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**T)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**T)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T**A1_in
 *>
 *>       B1_out: = - V2*T**A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**T) * W2
 *>                      = (unit_lower_tr_of_(A1)**T) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**T) * W1
 *>                    = (unit_lower_tr_of_(A1)**T) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      REAL               A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE, ZERO
      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SCOPY, SGEMM, STRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO
         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
     $                   WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = ZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of SLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/sorgtsqr_row.f
+++ b/lapack-netlib/SRC/sorgtsqr_row.f
@ -0,0 +1,379 @@
 *> \brief \b SORGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SORGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SORGTSQR_ROW generates an M-by-N real matrix Q_out with
 *> orthonormal columns from the output of SLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by SLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of SLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine SLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which SLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by SLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by SLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by SLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See SLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is REAL array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See SLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup sigleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE, ZERO
      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      REAL               DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SLARFB_GETT, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          REAL, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SORGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL SLASET('U', M, N, ZERO, ONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )
         END IF
 *
      END DO
 *
      WORK( 1 ) = REAL( LWORKOPT )
      RETURN
 *
 *     End of SORGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/zgetsqrhrt.f
+++ b/lapack-netlib/SRC/zgetsqrhrt.f
@ -0,0 +1,349 @@
 *> \brief \b ZGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in ZGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of ZGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is COMPLEX*16 array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup comlpex16OTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, DCMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for ZLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for ZUNGTSQR_ROW;
 *        d) Diagonal D for ZUNHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of ZUNGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *
      CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
            DO J = I, N
               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = DCMPLX( LWORKOPT )
      RETURN
 *
 *     End of ZGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/zlarfb_gett.f
+++ b/lapack-netlib/SRC/zlarfb_gett.f
@ -0,0 +1,597 @@
 *> \brief \b ZLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16         A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZLARFB_GETT applies a complex Householder block reflector H from the
 *> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX*16 array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is COMPLEX*16 array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16OTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**H)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**H)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*A1_in
 *>
 *>       B1_out: = - V2*T*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**H) * W2
 *>                      = (unit_lower_tr_of_(A1)**H) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**H) * W1
 *>                    = (unit_lower_tr_of_(A1)**H) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16         A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE, CZERO
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZCOPY, ZGEMM, ZTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO
         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
     $                   WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = CZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of ZLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/zungtsqr_row.f
+++ b/lapack-netlib/SRC/zungtsqr_row.f
@ -0,0 +1,380 @@
 *> \brief \b ZUNGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZUNGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
 *> orthonormal columns from the output of ZLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by ZLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of ZLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine ZLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which ZLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by ZLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by ZLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by ZLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See ZLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX*16 array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See ZLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16OTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE, CZERO
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      COMPLEX*16         DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZLARFB_GETT, ZLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          DCMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZUNGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL ZLASET('U', M, N, CZERO, CONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )
         END IF
 *
      END DO
 *
      WORK( 1 ) = DCMPLX( LWORKOPT )
      RETURN
 *
 *     End of ZUNGTSQR_ROW
 *
      END
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@ -40,7 +40,7 @@ set(SLINTST schkaa.f
   sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f
   schklqt.f schklqtp.f schktsqr.f
   serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f
-   schkorhr_col.f serrorhr_col.f sorhr_col01.f)
+   schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f)
 if(USE_XBLAS)
  list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f
@ -96,7 +96,7 @@ set(CLINTST cchkaa.f
   cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f
   cchklqt.f cchklqtp.f cchktsqr.f
   cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f
-   cchkunhr_col.f cerrunhr_col.f cunhr_col01.f)
+   cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f)
 if(USE_XBLAS)
  list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f
@ -142,7 +142,7 @@ set(DLINTST dchkaa.f
   dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f
   dchklq.f dchklqt.f dchklqtp.f dchktsqr.f
   derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f
-   dchkorhr_col.f derrorhr_col.f dorhr_col01.f)
+   dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f)
 if(USE_XBLAS)
  list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f
@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f
   zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f
   zchklqt.f zchklqtp.f zchktsqr.f
   zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f
-   zchkunhr_col.f zerrunhr_col.f zunhr_col01.f)
+   zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f)
 if(USE_XBLAS)
  list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@ -74,7 +74,7 @@ SLINTST = schkaa.o \
   sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \
   schklqt.o schklqtp.o schktsqr.o \
   serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \
-   schkorhr_col.o serrorhr_col.o sorhr_col01.o
+   schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o
 ifdef USEXBLAS
 SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \
@ -123,7 +123,7 @@ CLINTST = cchkaa.o \
   cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \
   cchklqt.o cchklqtp.o cchktsqr.o \
   cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \
-   cchkunhr_col.o cerrunhr_col.o cunhr_col01.o
+   cchkunhr_col.o cerrunhr_col.o cunhr_col01.o cunhr_col02.o
 ifdef USEXBLAS
 CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \
@ -167,7 +167,7 @@ DLINTST = dchkaa.o \
   dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \
   dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \
   derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \
-   dchkorhr_col.o derrorhr_col.o dorhr_col01.o
+   dchkorhr_col.o derrorhr_col.o dorhr_col01.o dorhr_col02.o
 ifdef USEXBLAS
 DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \
@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \
   zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \
   zchklqt.o zchklqtp.o zchktsqr.o \
   zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \
-   zchkunhr_col.o zerrunhr_col.o zunhr_col01.o
+   zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o
 ifdef USEXBLAS
 ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \
--- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f
+++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f
@ -24,9 +24,12 @@
 *>
 *> \verbatim
 *>
-*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR
+*> CCHKUNHR_COL tests:
-*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested
+*>   1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT,
-*> before this test.
+*>   2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT
 *>      (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT.
 *> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
@ -97,19 +100,16 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \date November 2019
 *
 *> \ingroup complex_lin
 *
 *  =====================================================================
-      SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+      SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL,
-     $                         NBVAL, NOUT )
+     $                         NNB, NBVAL, NOUT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
 *     .. Scalar Arguments ..
      LOGICAL            TSTERR
@ -135,10 +135,11 @@
      REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01
+      EXTERNAL           ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01,
     $                   CUNHR_COL02
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC  MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      LOGICAL            LERR, OK
@ -201,8 +202,8 @@
 *
 *                             Test CUNHR_COL
 *
-                              CALL CUNHR_COL01( M, N, MB1, NB1, NB2,
+                              CALL CUNHR_COL01( M, N, MB1, NB1,
-     $                                          RESULT )
+     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
@ -226,12 +227,78 @@
         END DO
      END DO
 *
 *     Do for each value of M in MVAL.
 *
      DO I = 1, NM
         M = MVAL( I )
 *
 *        Do for each value of N in NVAL.
 *
         DO J = 1, NN
            N = NVAL( J )
 *
 *           Only for M >= N
 *
            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
 *
 *              Do for each possible value of MB1
 *
               DO IMB1 = 1, NNB
                  MB1 = NBVAL( IMB1 )
 *
 *                 Only for MB1 > N
 *
                  IF ( MB1.GT.N ) THEN
 *
 *                    Do for each possible value of NB1
 *
                     DO INB1 = 1, NNB
                        NB1 = NBVAL( INB1 )
 *
 *                       Do for each possible value of NB2
 *
                        DO INB2 = 1, NNB
                           NB2 = NBVAL( INB2 )
 *
                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
 *
 *                             Test CUNHR_COL
 *
                              CALL CUNHR_COL02( M, N, MB1, NB1,
     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
 *
                              DO T = 1, NTESTS
                                 IF( RESULT( T ).GE.THRESH ) THEN
                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
     $                              CALL ALAHD( NOUT, PATH )
                                    WRITE( NOUT, FMT = 9998 ) M, N, MB1,
     $                                     NB1, NB2, T, RESULT( T )
                                    NFAIL = NFAIL + 1
                                 END IF
                              END DO
                              NRUN = NRUN + NTESTS
                           END IF
                        END DO
                     END DO
                  END IF
                END DO
            END IF
         END DO
      END DO
 *
 *     Print a summary of the results.
 *
      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
 *
- 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+ 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5,
-     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5,
     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
      RETURN
 *
 *     End of CCHKUNHR_COL
--- a/lapack-netlib/TESTING/LIN/cunhr_col01.f
+++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f
@ -13,7 +13,7 @@
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
-*       REAL              RESULT(6)
+*       DOUBLE PRECISION  RESULT(6)
 *
 *
 *> \par Purpose:
@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR.
+*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT.
-*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR
+*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
@ -62,14 +62,46 @@
 *> \verbatim
 *>          RESULT is REAL array, dimension (6)
 *>          Results of each of the six tests below.
 *>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
 *>
-*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            A is a m-by-n test input matrix to be factored.
-*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>            so that A = Q_gr * ( R )
-*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>                               ( 0 ),
-*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>
-*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            Q_qr is an implicit m-by-m unitary Q matrix, the result
-*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*>            of factorization in blocked WY-representation,
 *>            stored in CGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m unitary matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using CGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using CGEMM.
 *> \endverbatim
 *
 *  Authors:
@ -80,18 +112,15 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2019
+*> \ingroup complex_lin
 *
 *> \ingroup complex16_lin
 *
 *  =====================================================================
      SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2019
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
@ -102,10 +131,10 @@
 *
 *     ..
 *     .. Local allocatable arrays
-      COMPLEX, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+      COMPLEX         , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
-      REAL, ALLOCATABLE :: RWORK(:)
+      REAL            , ALLOCATABLE :: RWORK(:)
 *
 *     .. Parameters ..
      REAL               ZERO
@ -218,7 +247,7 @@
 *     Copy the factor R into the array R.
 *
      SRNAMT = 'CLACPY'
-      CALL CLACPY( 'U', M, N, AF, M, R, M )
+      CALL CLACPY( 'U', N, N, AF, M, R, M )
 *
 *     Reconstruct the orthogonal matrix Q.
 *
@ -240,7 +269,7 @@
 *     matrix S.
 *
      SRNAMT = 'CLACPY'
-      CALL CLACPY( 'U', M, N, R, M, AF, M )
+      CALL CLACPY( 'U', N, N, R, M, AF, M )
 *
      DO I = 1, N
         IF( DIAG( I ).EQ.-CONE ) THEN
--- a/lapack-netlib/TESTING/LIN/cunhr_col02.f
+++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f
@ -0,0 +1,381 @@
 *> \brief \b CUNHR_COL02
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT )
 *
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
 *       REAL              RESULT(6)
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT
 *> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT.
 *> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          Number of rows in test matrix.
 *> \endverbatim
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          Number of columns in test matrix.
 *> \endverbatim
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          Number of row in row block in an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          Number of columns in column block an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          Number of columns in column block in an output test matrix.
 *> \endverbatim
 *>
 *> \param[out] RESULT
 *> \verbatim
 *>          RESULT is REAL array, dimension (6)
 *>          Results of each of the six tests below.
 *>
 *>            A is a m-by-n test input matrix to be factored.
 *>            so that A = Q_gr * ( R )
 *>                               ( 0 ),
 *>
 *>            Q_qr is an implicit m-by-m unitary Q matrix, the result
 *>            of factorization in blocked WY-representation,
 *>            stored in CGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m unitary matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using CGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using CGEMM.
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex_lin
 *
 *  =====================================================================
      SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
 *  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
 *     .. Return values ..
      REAL              RESULT(6)
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local allocatable arrays
      COMPLEX         , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
      REAL            , ALLOCATABLE :: RWORK(:)
 *
 *     .. Parameters ..
      REAL               ZERO
      PARAMETER          ( ZERO = 0.0E+0 )
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            TESTZEROS
      INTEGER            INFO, J, K, L, LWORK, NB2_UB, NRB
      REAL               ANORM, EPS, RESID, CNORM, DNORM
 *     ..
 *     .. Local Arrays ..
      INTEGER            ISEED( 4 )
      COMPLEX            WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH, CLANGE, CLANSY
      EXTERNAL           SLAMCH, CLANGE, CLANSY
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CLACPY, CLARNV, CLASET, CGETSQRHRT,
     $                   CSCAL, CGEMM, CGEMQRT, CHERK
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, REAL, MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      CHARACTER(LEN=32)  SRNAMT
 *     ..
 *     .. Common blocks ..
      COMMON             / SRMNAMC / SRNAMT
 *     ..
 *     .. Data statements ..
      DATA ISEED / 1988, 1989, 1990, 1991 /
 *
 *     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
 *
      TESTZEROS = .FALSE.
 *
      EPS = SLAMCH( 'Epsilon' )
      K = MIN( M, N )
      L = MAX( M, N, 1)
 *
 *     Dynamically allocate local arrays
 *
      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
     $           C(M,N), CF(M,N),
     $           D(N,M), DF(N,M) )
 *
 *     Put random numbers into A and copy to AF
 *
      DO J = 1, N
         CALL CLARNV( 2, ISEED, M, A( 1, J ) )
      END DO
      IF( TESTZEROS ) THEN
         IF( M.GE.4 ) THEN
            DO J = 1, N
               CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) )
            END DO
         END IF
      END IF
      CALL CLACPY( 'Full', M, N, A, M, AF, M )
 *
 *     Number of row blocks in CLATSQR
 *
      NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
      ALLOCATE ( T1( NB1, N * NRB ) )
      ALLOCATE ( T2( NB2, N ) )
      ALLOCATE ( DIAG( N ) )
 *
 *     Begin determine LWORK for the array WORK and allocate memory.
 *
 *     CGEMQRT requires NB2 to be bounded by N.
 *
      NB2_UB = MIN( NB2, N)
 *
 *
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
 *
      LWORK = INT( WORKQUERY( 1 ) )
 *
 *     In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
 *                or  M*NB2_UB if SIDE = 'R'.
 *
      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
 *
      ALLOCATE ( WORK( LWORK ) )
 *
 *     End allocate memory for WORK.
 *
 *
 *     Begin Householder reconstruction routines
 *
 *     Factor the matrix A in the array AF.
 *
      SRNAMT = 'CGETSQRHRT'
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
 *
 *     End Householder reconstruction routines.
 *
 *
 *     Generate the m-by-m matrix Q
 *
      CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M )
 *
      SRNAMT = 'CGEMQRT'
      CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
     $              WORK, INFO )
 *
 *     Copy R
 *
      CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M )
 *
      CALL CLACPY( 'Upper', M, N, AF, M, R, M )
 *
 *     TEST 1
 *     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
 *
      CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M )
 *
      ANORM = CLANGE( '1', M, N, A, M, RWORK )
      RESID = CLANGE( '1', M, N, R, M, RWORK )
      IF( ANORM.GT.ZERO ) THEN
         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
      ELSE
         RESULT( 1 ) = ZERO
      END IF
 *
 *     TEST 2
 *     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
 *
      CALL CLASET( 'Full', M, M, CZERO, CONE, R, M )
      CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M )
      RESID = CLANSY( '1', 'Upper', M, R, M, RWORK )
      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
 *
 *     Generate random m-by-n matrix C
 *
      DO J = 1, N
         CALL CLARNV( 2, ISEED, M, C( 1, J ) )
      END DO
      CNORM = CLANGE( '1', M, N, C, M, RWORK )
      CALL CLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as Q*C = CF
 *
      SRNAMT = 'CGEMQRT'
      CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 3
 *     Compute |CF - Q*C| / ( eps *  m * |C| )
 *
      CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
      RESID = CLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 3 ) = ZERO
      END IF
 *
 *     Copy C into CF again
 *
      CALL CLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as (Q**T)*C = CF
 *
      SRNAMT = 'CGEMQRT'
      CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 4
 *     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
 *
      CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
      RESID = CLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 4 ) = ZERO
      END IF
 *
 *     Generate random n-by-m matrix D and a copy DF
 *
      DO J = 1, M
         CALL CLARNV( 2, ISEED, N, D( 1, J ) )
      END DO
      DNORM = CLANGE( '1', N, M, D, N, RWORK )
      CALL CLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*Q = DF
 *
      SRNAMT = 'CGEMQRT'
      CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 5
 *     Compute |DF - D*Q| / ( eps * m * |D| )
 *
      CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
      RESID = CLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 5 ) = ZERO
      END IF
 *
 *     Copy D into DF again
 *
      CALL CLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*QT = DF
 *
      SRNAMT = 'CGEMQRT'
      CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 6
 *     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
 *
      CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
      RESID = CLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 6 ) = ZERO
      END IF
 *
 *     Deallocate all arrays
 *
      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
     $             C, D, CF, DF )
 *
      RETURN
 *
 *     End of CUNHR_COL02
 *
      END
--- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f
+++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f
@ -24,9 +24,12 @@
 *>
 *> \verbatim
 *>
-*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR
+*> DCHKORHR_COL tests:
-*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested
+*>   1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT,
-*> before this test.
+*>   2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT
 *>      (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT.
 *> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
@ -97,19 +100,16 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \date November 2019
 *
 *> \ingroup double_lin
 *
 *  =====================================================================
-      SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+      SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL,
-     $                         NBVAL, NOUT )
+     $                         NNB, NBVAL, NOUT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
 *     .. Scalar Arguments ..
      LOGICAL            TSTERR
@ -135,10 +135,11 @@
      DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01
+      EXTERNAL           ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01,
     $                   DORHR_COL02
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC  MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      LOGICAL            LERR, OK
@ -201,8 +202,8 @@
 *
 *                             Test DORHR_COL
 *
-                              CALL DORHR_COL01( M, N, MB1, NB1, NB2,
+                              CALL DORHR_COL01( M, N, MB1, NB1,
-     $                                          RESULT )
+     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
@ -226,12 +227,78 @@
         END DO
      END DO
 *
 *     Do for each value of M in MVAL.
 *
      DO I = 1, NM
         M = MVAL( I )
 *
 *        Do for each value of N in NVAL.
 *
         DO J = 1, NN
            N = NVAL( J )
 *
 *           Only for M >= N
 *
            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
 *
 *              Do for each possible value of MB1
 *
               DO IMB1 = 1, NNB
                  MB1 = NBVAL( IMB1 )
 *
 *                 Only for MB1 > N
 *
                  IF ( MB1.GT.N ) THEN
 *
 *                    Do for each possible value of NB1
 *
                     DO INB1 = 1, NNB
                        NB1 = NBVAL( INB1 )
 *
 *                       Do for each possible value of NB2
 *
                        DO INB2 = 1, NNB
                           NB2 = NBVAL( INB2 )
 *
                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
 *
 *                             Test DORHR_COL
 *
                              CALL DORHR_COL02( M, N, MB1, NB1,
     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
 *
                              DO T = 1, NTESTS
                                 IF( RESULT( T ).GE.THRESH ) THEN
                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
     $                              CALL ALAHD( NOUT, PATH )
                                    WRITE( NOUT, FMT = 9998 ) M, N, MB1,
     $                                     NB1, NB2, T, RESULT( T )
                                    NFAIL = NFAIL + 1
                                 END IF
                              END DO
                              NRUN = NRUN + NTESTS
                           END IF
                        END DO
                     END DO
                  END IF
                END DO
            END IF
         END DO
      END DO
 *
 *     Print a summary of the results.
 *
      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
 *
- 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+ 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5,
-     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5,
     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
      RETURN
 *
 *     End of DCHKORHR_COL
--- a/lapack-netlib/TESTING/LIN/dorhr_col01.f
+++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f
@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR.
+*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT.
-*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR
+*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
@ -62,14 +62,46 @@
 *> \verbatim
 *>          RESULT is DOUBLE PRECISION array, dimension (6)
 *>          Results of each of the six tests below.
 *>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
 *>
-*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            A is a m-by-n test input matrix to be factored.
-*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>            so that A = Q_gr * ( R )
-*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>                               ( 0 ),
-*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>
-*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            Q_qr is an implicit m-by-m orthogonal Q matrix, the result
-*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*>            of factorization in blocked WY-representation,
 *>            stored in ZGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using DGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using DGEMM.
 *> \endverbatim
 *
 *  Authors:
@ -80,18 +112,15 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2019
+*> \ingroup double_lin
 *
 *> \ingroup single_lin
 *
 *  =====================================================================
      SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2019
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
--- a/lapack-netlib/TESTING/LIN/dorhr_col02.f
+++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f
@ -0,0 +1,377 @@
 *> \brief \b DORHR_COL02
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT )
 *
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
 *       DOUBLE PRECISION  RESULT(6)
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT
 *> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT.
 *> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          Number of rows in test matrix.
 *> \endverbatim
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          Number of columns in test matrix.
 *> \endverbatim
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          Number of row in row block in an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          Number of columns in column block an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          Number of columns in column block in an output test matrix.
 *> \endverbatim
 *>
 *> \param[out] RESULT
 *> \verbatim
 *>          RESULT is DOUBLE PRECISION array, dimension (6)
 *>          Results of each of the six tests below.
 *>
 *>            A is a m-by-n test input matrix to be factored.
 *>            so that A = Q_gr * ( R )
 *>                               ( 0 ),
 *>
 *>            Q_qr is an implicit m-by-m orthogonal Q matrix, the result
 *>            of factorization in blocked WY-representation,
 *>            stored in ZGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using DGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using DGEMM.
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup double_lin
 *
 *  =====================================================================
      SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
 *  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
 *     .. Return values ..
      DOUBLE PRECISION  RESULT(6)
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local allocatable arrays
      DOUBLE PRECISION, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            TESTZEROS
      INTEGER            INFO, J, K, L, LWORK, NB2_UB, NRB
      DOUBLE PRECISION   ANORM, EPS, RESID, CNORM, DNORM
 *     ..
 *     .. Local Arrays ..
      INTEGER            ISEED( 4 )
      DOUBLE PRECISION   WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH, DLANGE, DLANSY
      EXTERNAL           DLAMCH, DLANGE, DLANSY
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DLACPY, DLARNV, DLASET, DGETSQRHRT,
     $                   DSCAL, DGEMM, DGEMQRT, DSYRK
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      CHARACTER(LEN=32)  SRNAMT
 *     ..
 *     .. Common blocks ..
      COMMON             / SRMNAMC / SRNAMT
 *     ..
 *     .. Data statements ..
      DATA ISEED / 1988, 1989, 1990, 1991 /
 *
 *     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
 *
      TESTZEROS = .FALSE.
 *
      EPS = DLAMCH( 'Epsilon' )
      K = MIN( M, N )
      L = MAX( M, N, 1)
 *
 *     Dynamically allocate local arrays
 *
      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
     $           C(M,N), CF(M,N),
     $           D(N,M), DF(N,M) )
 *
 *     Put random numbers into A and copy to AF
 *
      DO J = 1, N
         CALL DLARNV( 2, ISEED, M, A( 1, J ) )
      END DO
      IF( TESTZEROS ) THEN
         IF( M.GE.4 ) THEN
            DO J = 1, N
               CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) )
            END DO
         END IF
      END IF
      CALL DLACPY( 'Full', M, N, A, M, AF, M )
 *
 *     Number of row blocks in DLATSQR
 *
      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
      ALLOCATE ( T1( NB1, N * NRB ) )
      ALLOCATE ( T2( NB2, N ) )
      ALLOCATE ( DIAG( N ) )
 *
 *     Begin determine LWORK for the array WORK and allocate memory.
 *
 *     DGEMQRT requires NB2 to be bounded by N.
 *
      NB2_UB = MIN( NB2, N)
 *
 *
      CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
 *
      LWORK = INT( WORKQUERY( 1 ) )
 *
 *     In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
 *                or  M*NB2_UB if SIDE = 'R'.
 *
      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
 *
      ALLOCATE ( WORK( LWORK ) )
 *
 *     End allocate memory for WORK.
 *
 *
 *     Begin Householder reconstruction routines
 *
 *     Factor the matrix A in the array AF.
 *
      SRNAMT = 'DGETSQRHRT'
      CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
 *
 *     End Householder reconstruction routines.
 *
 *
 *     Generate the m-by-m matrix Q
 *
      CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M )
 *
      SRNAMT = 'DGEMQRT'
      CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
     $              WORK, INFO )
 *
 *     Copy R
 *
      CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M )
 *
      CALL DLACPY( 'Upper', M, N, AF, M, R, M )
 *
 *     TEST 1
 *     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
 *
      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
 *
      ANORM = DLANGE( '1', M, N, A, M, RWORK )
      RESID = DLANGE( '1', M, N, R, M, RWORK )
      IF( ANORM.GT.ZERO ) THEN
         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
      ELSE
         RESULT( 1 ) = ZERO
      END IF
 *
 *     TEST 2
 *     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
 *
      CALL DLASET( 'Full', M, M, ZERO, ONE, R, M )
      CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
      RESID = DLANSY( '1', 'Upper', M, R, M, RWORK )
      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
 *
 *     Generate random m-by-n matrix C
 *
      DO J = 1, N
         CALL DLARNV( 2, ISEED, M, C( 1, J ) )
      END DO
      CNORM = DLANGE( '1', M, N, C, M, RWORK )
      CALL DLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as Q*C = CF
 *
      SRNAMT = 'DGEMQRT'
      CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 3
 *     Compute |CF - Q*C| / ( eps *  m * |C| )
 *
      CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
      RESID = DLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 3 ) = ZERO
      END IF
 *
 *     Copy C into CF again
 *
      CALL DLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as (Q**T)*C = CF
 *
      SRNAMT = 'DGEMQRT'
      CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 4
 *     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
 *
      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
      RESID = DLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 4 ) = ZERO
      END IF
 *
 *     Generate random n-by-m matrix D and a copy DF
 *
      DO J = 1, M
         CALL DLARNV( 2, ISEED, N, D( 1, J ) )
      END DO
      DNORM = DLANGE( '1', N, M, D, N, RWORK )
      CALL DLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*Q = DF
 *
      SRNAMT = 'DGEMQRT'
      CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 5
 *     Compute |DF - D*Q| / ( eps * m * |D| )
 *
      CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
      RESID = DLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 5 ) = ZERO
      END IF
 *
 *     Copy D into DF again
 *
      CALL DLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*QT = DF
 *
      SRNAMT = 'DGEMQRT'
      CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 6
 *     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
 *
      CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
      RESID = DLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 6 ) = ZERO
      END IF
 *
 *     Deallocate all arrays
 *
      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
     $             C, D, CF, DF )
 *
      RETURN
 *
 *     End of DORHR_COL02
 *
      END
--- a/lapack-netlib/TESTING/LIN/schkorhr_col.f
+++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f
@ -24,8 +24,11 @@
 *>
 *> \verbatim
 *>
-*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR.
+*> SCHKORHR_COL tests:
-*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR
+*>   1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT,
 *>   2) SORGTSQR_ROW and SORHR_COL inside DGETSQRHRT
 *>      (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT.
 *> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
@ -97,19 +100,16 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2019
+*> \ingroup single_lin
 *
 *> \ingroup sigle_lin
 *
 *  =====================================================================
-      SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+      SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL,
-     $                         NBVAL, NOUT )
+     $                         NNB, NBVAL, NOUT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     June 2019
 *
 *     .. Scalar Arguments ..
      LOGICAL            TSTERR
@ -135,7 +135,8 @@
      REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01
+      EXTERNAL           ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01,
     $                   SORHR_COL02
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          MAX, MIN
@ -201,8 +202,8 @@
 *
 *                             Test SORHR_COL
 *
-                              CALL SORHR_COL01( M, N, MB1, NB1, NB2,
+                              CALL SORHR_COL01( M, N, MB1, NB1,
-     $                                          RESULT )
+     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
@ -226,12 +227,78 @@
         END DO
      END DO
 *
 *     Do for each value of M in MVAL.
 *
      DO I = 1, NM
         M = MVAL( I )
 *
 *        Do for each value of N in NVAL.
 *
         DO J = 1, NN
            N = NVAL( J )
 *
 *           Only for M >= N
 *
            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
 *
 *              Do for each possible value of MB1
 *
               DO IMB1 = 1, NNB
                  MB1 = NBVAL( IMB1 )
 *
 *                 Only for MB1 > N
 *
                  IF ( MB1.GT.N ) THEN
 *
 *                    Do for each possible value of NB1
 *
                     DO INB1 = 1, NNB
                        NB1 = NBVAL( INB1 )
 *
 *                       Do for each possible value of NB2
 *
                        DO INB2 = 1, NNB
                           NB2 = NBVAL( INB2 )
 *
                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
 *
 *                             Test SORHR_COL
 *
                              CALL SORHR_COL02( M, N, MB1, NB1,
     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
 *
                              DO T = 1, NTESTS
                                 IF( RESULT( T ).GE.THRESH ) THEN
                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
     $                              CALL ALAHD( NOUT, PATH )
                                    WRITE( NOUT, FMT = 9998 ) M, N, MB1,
     $                                     NB1, NB2, T, RESULT( T )
                                    NFAIL = NFAIL + 1
                                 END IF
                              END DO
                              NRUN = NRUN + NTESTS
                           END IF
                        END DO
                     END DO
                  END IF
                END DO
            END IF
         END DO
      END DO
 *
 *     Print a summary of the results.
 *
      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
 *
- 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+ 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5,
-     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5,
     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
      RETURN
 *
 *     End of SCHKORHR_COL
--- a/lapack-netlib/TESTING/LIN/sorhr_col01.f
+++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f
@ -8,12 +8,12 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT)
+*       SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
 *
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
-*       REAL             RESULT(6)
+*       REAL              RESULT(6)
 *
 *
 *> \par Purpose:
@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR.
+*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT.
-*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR
+*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
@ -62,14 +62,46 @@
 *> \verbatim
 *>          RESULT is REAL array, dimension (6)
 *>          Results of each of the six tests below.
 *>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
 *>
-*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            A is a m-by-n test input matrix to be factored.
-*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>            so that A = Q_gr * ( R )
-*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>                               ( 0 ),
-*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>
-*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            Q_qr is an implicit m-by-m orthogonal Q matrix, the result
-*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*>            of factorization in blocked WY-representation,
 *>            stored in SGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using SGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using SGEMM.
 *> \endverbatim
 *
 *  Authors:
@ -80,18 +112,15 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \date November 2019
 *
 *> \ingroup single_lin
 *
 *  =====================================================================
      SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2019
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
@ -102,7 +131,7 @@
 *
 *     ..
 *     .. Local allocatable arrays
-      REAL, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+      REAL            , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
 *
@ -128,7 +157,7 @@
     $                   SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          CEILING, MAX, MIN, REAL
+      INTRINSIC          CEILING, REAL, MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      CHARACTER(LEN=32)  SRNAMT
@ -230,7 +259,7 @@
 *
 *     Compute the factor R_hr corresponding to the Householder
 *     reconstructed Q_hr and place it in the upper triangle of AF to
-*     match the Q storage format in DGEQRT. R_hr = R_tsqr * S,
+*     match the Q storage format in SGEQRT. R_hr = R_tsqr * S,
 *     this means changing the sign of I-th row of the matrix R_tsqr
 *     according to sign of of I-th diagonal element DIAG(I) of the
 *     matrix S.
--- a/lapack-netlib/TESTING/LIN/sorhr_col02.f
+++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f
@ -0,0 +1,376 @@
 *> \brief \b SORHR_COL02
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT )
 *
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
 *       REAL              RESULT(6)
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT
 *> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT.
 *> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          Number of rows in test matrix.
 *> \endverbatim
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          Number of columns in test matrix.
 *> \endverbatim
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          Number of row in row block in an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          Number of columns in column block an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          Number of columns in column block in an output test matrix.
 *> \endverbatim
 *>
 *> \param[out] RESULT
 *> \verbatim
 *>          RESULT is REAL array, dimension (6)
 *>          Results of each of the six tests below.
 *>
 *>            A is a m-by-n test input matrix to be factored.
 *>            so that A = Q_gr * ( R )
 *>                               ( 0 ),
 *>
 *>            Q_qr is an implicit m-by-m orthogonal Q matrix, the result
 *>            of factorization in blocked WY-representation,
 *>            stored in SGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using SGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using SGEMM.
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup single_lin
 *
 *  =====================================================================
      SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
 *  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
 *     .. Return values ..
      REAL              RESULT(6)
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local allocatable arrays
      REAL            , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
 *
 *     .. Parameters ..
      REAL               ONE, ZERO
      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            TESTZEROS
      INTEGER            INFO, J, K, L, LWORK, NB2_UB, NRB
      REAL               ANORM, EPS, RESID, CNORM, DNORM
 *     ..
 *     .. Local Arrays ..
      INTEGER            ISEED( 4 )
      REAL               WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH, SLANGE, SLANSY
      EXTERNAL           SLAMCH, SLANGE, SLANSY
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SLACPY, SLARNV, SLASET, SGETSQRHRT,
     $                   SSCAL, SGEMM, SGEMQRT, SSYRK
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, REAL, MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      CHARACTER(LEN=32)  SRNAMT
 *     ..
 *     .. Common blocks ..
      COMMON             / SRMNAMC / SRNAMT
 *     ..
 *     .. Data statements ..
      DATA ISEED / 1988, 1989, 1990, 1991 /
 *
 *     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
 *
      TESTZEROS = .FALSE.
 *
      EPS = SLAMCH( 'Epsilon' )
      K = MIN( M, N )
      L = MAX( M, N, 1)
 *
 *     Dynamically allocate local arrays
 *
      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
     $           C(M,N), CF(M,N),
     $           D(N,M), DF(N,M) )
 *
 *     Put random numbers into A and copy to AF
 *
      DO J = 1, N
         CALL SLARNV( 2, ISEED, M, A( 1, J ) )
      END DO
      IF( TESTZEROS ) THEN
         IF( M.GE.4 ) THEN
            DO J = 1, N
               CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) )
            END DO
         END IF
      END IF
      CALL SLACPY( 'Full', M, N, A, M, AF, M )
 *
 *     Number of row blocks in SLATSQR
 *
      NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
      ALLOCATE ( T1( NB1, N * NRB ) )
      ALLOCATE ( T2( NB2, N ) )
      ALLOCATE ( DIAG( N ) )
 *
 *     Begin determine LWORK for the array WORK and allocate memory.
 *
 *     SGEMQRT requires NB2 to be bounded by N.
 *
      NB2_UB = MIN( NB2, N)
 *
      CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
 *
      LWORK = INT( WORKQUERY( 1 ) )
 *
 *     In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
 *                or  M*NB2_UB if SIDE = 'R'.
 *
      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
 *
      ALLOCATE ( WORK( LWORK ) )
 *
 *     End allocate memory for WORK.
 *
 *
 *     Begin Householder reconstruction routines
 *
 *     Factor the matrix A in the array AF.
 *
      SRNAMT = 'SGETSQRHRT'
      CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
 *
 *     End Householder reconstruction routines.
 *
 *
 *     Generate the m-by-m matrix Q
 *
      CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M )
 *
      SRNAMT = 'SGEMQRT'
      CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
     $              WORK, INFO )
 *
 *     Copy R
 *
      CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M )
 *
      CALL SLACPY( 'Upper', M, N, AF, M, R, M )
 *
 *     TEST 1
 *     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
 *
      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
 *
      ANORM = SLANGE( '1', M, N, A, M, RWORK )
      RESID = SLANGE( '1', M, N, R, M, RWORK )
      IF( ANORM.GT.ZERO ) THEN
         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
      ELSE
         RESULT( 1 ) = ZERO
      END IF
 *
 *     TEST 2
 *     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
 *
      CALL SLASET( 'Full', M, M, ZERO, ONE, R, M )
      CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
      RESID = SLANSY( '1', 'Upper', M, R, M, RWORK )
      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
 *
 *     Generate random m-by-n matrix C
 *
      DO J = 1, N
         CALL SLARNV( 2, ISEED, M, C( 1, J ) )
      END DO
      CNORM = SLANGE( '1', M, N, C, M, RWORK )
      CALL SLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as Q*C = CF
 *
      SRNAMT = 'SGEMQRT'
      CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 3
 *     Compute |CF - Q*C| / ( eps *  m * |C| )
 *
      CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
      RESID = SLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 3 ) = ZERO
      END IF
 *
 *     Copy C into CF again
 *
      CALL SLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as (Q**T)*C = CF
 *
      SRNAMT = 'SGEMQRT'
      CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 4
 *     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
 *
      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
      RESID = SLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 4 ) = ZERO
      END IF
 *
 *     Generate random n-by-m matrix D and a copy DF
 *
      DO J = 1, M
         CALL SLARNV( 2, ISEED, N, D( 1, J ) )
      END DO
      DNORM = SLANGE( '1', N, M, D, N, RWORK )
      CALL SLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*Q = DF
 *
      SRNAMT = 'SGEMQRT'
      CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 5
 *     Compute |DF - D*Q| / ( eps * m * |D| )
 *
      CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
      RESID = SLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 5 ) = ZERO
      END IF
 *
 *     Copy D into DF again
 *
      CALL SLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*QT = DF
 *
      SRNAMT = 'SGEMQRT'
      CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 6
 *     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
 *
      CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
      RESID = SLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 6 ) = ZERO
      END IF
 *
 *     Deallocate all arrays
 *
      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
     $             C, D, CF, DF )
 *
      RETURN
 *
 *     End of SORHR_COL02
 *
      END
--- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f
+++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f
@ -24,9 +24,12 @@
 *>
 *> \verbatim
 *>
-*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR
+*> ZCHKUNHR_COL tests:
-*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested
+*>   1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT,
-*> before this test.
+*>   2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT
 *>      (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT.
 *> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
@ -97,19 +100,16 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \date November 2019
 *
 *> \ingroup complex16_lin
 *
 *  =====================================================================
-      SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+      SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL,
-     $                         NBVAL, NOUT )
+     $                         NNB, NBVAL, NOUT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
 *     .. Scalar Arguments ..
      LOGICAL            TSTERR
@ -135,10 +135,11 @@
      DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01
+      EXTERNAL           ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01,
     $                   ZUNHR_COL02
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC  MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      LOGICAL            LERR, OK
@ -201,8 +202,8 @@
 *
 *                             Test ZUNHR_COL
 *
-                              CALL ZUNHR_COL01( M, N, MB1, NB1, NB2,
+                              CALL ZUNHR_COL01( M, N, MB1, NB1,
-     $                                          RESULT )
+     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
@ -226,12 +227,78 @@
         END DO
      END DO
 *
 *     Do for each value of M in MVAL.
 *
      DO I = 1, NM
         M = MVAL( I )
 *
 *        Do for each value of N in NVAL.
 *
         DO J = 1, NN
            N = NVAL( J )
 *
 *           Only for M >= N
 *
            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
 *
 *              Do for each possible value of MB1
 *
               DO IMB1 = 1, NNB
                  MB1 = NBVAL( IMB1 )
 *
 *                 Only for MB1 > N
 *
                  IF ( MB1.GT.N ) THEN
 *
 *                    Do for each possible value of NB1
 *
                     DO INB1 = 1, NNB
                        NB1 = NBVAL( INB1 )
 *
 *                       Do for each possible value of NB2
 *
                        DO INB2 = 1, NNB
                           NB2 = NBVAL( INB2 )
 *
                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
 *
 *                             Test ZUNHR_COL
 *
                              CALL ZUNHR_COL02( M, N, MB1, NB1,
     $                                          NB2, RESULT )
 *
 *                             Print information about the tests that did
 *                             not pass the threshold.
 *
                              DO T = 1, NTESTS
                                 IF( RESULT( T ).GE.THRESH ) THEN
                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
     $                              CALL ALAHD( NOUT, PATH )
                                    WRITE( NOUT, FMT = 9998 ) M, N, MB1,
     $                                     NB1, NB2, T, RESULT( T )
                                    NFAIL = NFAIL + 1
                                 END IF
                              END DO
                              NRUN = NRUN + NTESTS
                           END IF
                        END DO
                     END DO
                  END IF
                END DO
            END IF
         END DO
      END DO
 *
 *     Print a summary of the results.
 *
      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
 *
- 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+ 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5,
-     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5,
     $        ', MB1=', I5, ', NB1=', I5, ', NB2=', I5,
     $        ' test(', I2, ')=', G12.5 )
      RETURN
 *
 *     End of ZCHKUNHR_COL
--- a/lapack-netlib/TESTING/LIN/zunhr_col01.f
+++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f
@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR.
+*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT.
-*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR
+*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
@ -62,14 +62,46 @@
 *> \verbatim
 *>          RESULT is DOUBLE PRECISION array, dimension (6)
 *>          Results of each of the six tests below.
 *>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
 *>
-*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            A is a m-by-n test input matrix to be factored.
-*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>            so that A = Q_gr * ( R )
-*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>                               ( 0 ),
-*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>
-*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            Q_qr is an implicit m-by-m unitary Q matrix, the result
-*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*>            of factorization in blocked WY-representation,
 *>            stored in ZGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m unitary matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using ZGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using ZGEMM.
 *> \endverbatim
 *
 *  Authors:
@ -80,18 +112,15 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \date November 2019
 *
 *> \ingroup complex16_lin
 *
 *  =====================================================================
      SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
-*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2019
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
@ -102,7 +131,7 @@
 *
 *     ..
 *     .. Local allocatable arrays
-      COMPLEX*16, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+      COMPLEX*16      , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
      DOUBLE PRECISION, ALLOCATABLE :: RWORK(:)
@ -218,7 +247,7 @@
 *     Copy the factor R into the array R.
 *
      SRNAMT = 'ZLACPY'
-      CALL ZLACPY( 'U', M, N, AF, M, R, M )
+      CALL ZLACPY( 'U', N, N, AF, M, R, M )
 *
 *     Reconstruct the orthogonal matrix Q.
 *
@ -240,7 +269,7 @@
 *     matrix S.
 *
      SRNAMT = 'ZLACPY'
-      CALL ZLACPY( 'U', M, N, R, M, AF, M )
+      CALL ZLACPY( 'U', N, N, R, M, AF, M )
 *
      DO I = 1, N
         IF( DIAG( I ).EQ.-CONE ) THEN
--- a/lapack-netlib/TESTING/LIN/zunhr_col02.f
+++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f
@ -0,0 +1,381 @@
 *> \brief \b ZUNHR_COL02
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT )
 *
 *       .. Scalar Arguments ..
 *       INTEGER           M, N, MB1, NB1, NB2
 *       .. Return values ..
 *       DOUBLE PRECISION  RESULT(6)
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT
 *> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT.
 *> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR)
 *> have to be tested before this test.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          Number of rows in test matrix.
 *> \endverbatim
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          Number of columns in test matrix.
 *> \endverbatim
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          Number of row in row block in an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          Number of columns in column block an input test matrix.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          Number of columns in column block in an output test matrix.
 *> \endverbatim
 *>
 *> \param[out] RESULT
 *> \verbatim
 *>          RESULT is DOUBLE PRECISION array, dimension (6)
 *>          Results of each of the six tests below.
 *>
 *>            A is a m-by-n test input matrix to be factored.
 *>            so that A = Q_gr * ( R )
 *>                               ( 0 ),
 *>
 *>            Q_qr is an implicit m-by-m unitary Q matrix, the result
 *>            of factorization in blocked WY-representation,
 *>            stored in ZGEQRT output format.
 *>
 *>            R is a n-by-n upper-triangular matrix,
 *>
 *>            0 is a (m-n)-by-n zero matrix,
 *>
 *>            Q is an explicit m-by-m unitary matrix Q = Q_gr * I
 *>
 *>            C is an m-by-n random matrix,
 *>
 *>            D is an n-by-m random matrix.
 *>
 *>          The six tests are:
 *>
 *>          RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| )
 *>            is equivalent to test for | A - Q * R | / (eps * m * |A|),
 *>
 *>          RESULT(2) = |I - (Q**H) * Q| / ( eps * m ),
 *>
 *>          RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|),
 *>
 *>          RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|)
 *>
 *>          RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|)
 *>
 *>          RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|),
 *>
 *>          where:
 *>            Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are
 *>            computed using ZGEMQRT,
 *>
 *>            Q * C, (Q**H) * C, D * Q, D * (Q**H)  are
 *>            computed using ZGEMM.
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16_lin
 *
 *  =====================================================================
      SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT )
      IMPLICIT NONE
 *
 *  -- LAPACK test routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           M, N, MB1, NB1, NB2
 *     .. Return values ..
      DOUBLE PRECISION  RESULT(6)
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local allocatable arrays
      COMPLEX*16      , ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
      DOUBLE PRECISION, ALLOCATABLE :: RWORK(:)
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ZERO
      PARAMETER          ( ZERO = 0.0D+0 )
      COMPLEX*16         CONE, CZERO
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            TESTZEROS
      INTEGER            INFO, J, K, L, LWORK, NB2_UB, NRB
      DOUBLE PRECISION   ANORM, EPS, RESID, CNORM, DNORM
 *     ..
 *     .. Local Arrays ..
      INTEGER            ISEED( 4 )
      COMPLEX*16         WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANSY
      EXTERNAL           DLAMCH, ZLANGE, ZLANSY
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT,
     $                   ZSCAL, ZGEMM, ZGEMQRT, ZHERK
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, MAX, MIN
 *     ..
 *     .. Scalars in Common ..
      CHARACTER(LEN=32)  SRNAMT
 *     ..
 *     .. Common blocks ..
      COMMON             / SRMNAMC / SRNAMT
 *     ..
 *     .. Data statements ..
      DATA ISEED / 1988, 1989, 1990, 1991 /
 *
 *     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
 *
      TESTZEROS = .FALSE.
 *
      EPS = DLAMCH( 'Epsilon' )
      K = MIN( M, N )
      L = MAX( M, N, 1)
 *
 *     Dynamically allocate local arrays
 *
      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
     $           C(M,N), CF(M,N),
     $           D(N,M), DF(N,M) )
 *
 *     Put random numbers into A and copy to AF
 *
      DO J = 1, N
         CALL ZLARNV( 2, ISEED, M, A( 1, J ) )
      END DO
      IF( TESTZEROS ) THEN
         IF( M.GE.4 ) THEN
            DO J = 1, N
               CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) )
            END DO
         END IF
      END IF
      CALL ZLACPY( 'Full', M, N, A, M, AF, M )
 *
 *     Number of row blocks in ZLATSQR
 *
      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
      ALLOCATE ( T1( NB1, N * NRB ) )
      ALLOCATE ( T2( NB2, N ) )
      ALLOCATE ( DIAG( N ) )
 *
 *     Begin determine LWORK for the array WORK and allocate memory.
 *
 *     ZGEMQRT requires NB2 to be bounded by N.
 *
      NB2_UB = MIN( NB2, N)
 *
 *
      CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
 *
      LWORK = INT( WORKQUERY( 1 ) )
 *
 *     In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
 *                or  M*NB2_UB if SIDE = 'R'.
 *
      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
 *
      ALLOCATE ( WORK( LWORK ) )
 *
 *     End allocate memory for WORK.
 *
 *
 *     Begin Householder reconstruction routines
 *
 *     Factor the matrix A in the array AF.
 *
      SRNAMT = 'ZGETSQRHRT'
      CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
 *
 *     End Householder reconstruction routines.
 *
 *
 *     Generate the m-by-m matrix Q
 *
      CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M )
 *
      SRNAMT = 'ZGEMQRT'
      CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
     $              WORK, INFO )
 *
 *     Copy R
 *
      CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M )
 *
      CALL ZLACPY( 'Upper', M, N, AF, M, R, M )
 *
 *     TEST 1
 *     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
 *
      CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M )
 *
      ANORM = ZLANGE( '1', M, N, A, M, RWORK )
      RESID = ZLANGE( '1', M, N, R, M, RWORK )
      IF( ANORM.GT.ZERO ) THEN
         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
      ELSE
         RESULT( 1 ) = ZERO
      END IF
 *
 *     TEST 2
 *     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
 *
      CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M )
      CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M )
      RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK )
      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
 *
 *     Generate random m-by-n matrix C
 *
      DO J = 1, N
         CALL ZLARNV( 2, ISEED, M, C( 1, J ) )
      END DO
      CNORM = ZLANGE( '1', M, N, C, M, RWORK )
      CALL ZLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as Q*C = CF
 *
      SRNAMT = 'ZGEMQRT'
      CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 3
 *     Compute |CF - Q*C| / ( eps *  m * |C| )
 *
      CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
      RESID = ZLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 3 ) = ZERO
      END IF
 *
 *     Copy C into CF again
 *
      CALL ZLACPY( 'Full', M, N, C, M, CF, M )
 *
 *     Apply Q to C as (Q**T)*C = CF
 *
      SRNAMT = 'ZGEMQRT'
      CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
     $               WORK, INFO )
 *
 *     TEST 4
 *     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
 *
      CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
      RESID = ZLANGE( '1', M, N, CF, M, RWORK )
      IF( CNORM.GT.ZERO ) THEN
         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
      ELSE
         RESULT( 4 ) = ZERO
      END IF
 *
 *     Generate random n-by-m matrix D and a copy DF
 *
      DO J = 1, M
         CALL ZLARNV( 2, ISEED, N, D( 1, J ) )
      END DO
      DNORM = ZLANGE( '1', N, M, D, N, RWORK )
      CALL ZLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*Q = DF
 *
      SRNAMT = 'ZGEMQRT'
      CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 5
 *     Compute |DF - D*Q| / ( eps * m * |D| )
 *
      CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
      RESID = ZLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 5 ) = ZERO
      END IF
 *
 *     Copy D into DF again
 *
      CALL ZLACPY( 'Full', N, M, D, N, DF, N )
 *
 *     Apply Q to D as D*QT = DF
 *
      SRNAMT = 'ZGEMQRT'
      CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
     $               WORK, INFO )
 *
 *     TEST 6
 *     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
 *
      CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
      RESID = ZLANGE( '1', N, M, DF, N, RWORK )
      IF( DNORM.GT.ZERO ) THEN
         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
      ELSE
         RESULT( 6 ) = ZERO
      END IF
 *
 *     Deallocate all arrays
 *
      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
     $             C, D, CF, DF )
 *
      RETURN
 *
 *     End of ZUNHR_COL02
 *
      END