diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 3b221d420..82511d41b 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -123,7 +123,8 @@ set(SLASRC ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slarmm.f slatrs3.f strsyl3.f) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -221,7 +222,8 @@ set(CLASRC cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -313,7 +315,8 @@ set(DLASRC dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlarmm.f dlatrs3.f dtrsyl3.f) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -415,7 +418,8 @@ set(ZLASRC zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlatrs3.f ztrsyl3.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -617,7 +621,8 @@ set(SLASRC ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c - slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c ) + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slarmm.c slatrs3.c strsyl3.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -714,7 +719,8 @@ set(CLASRC cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c ) + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -805,7 +811,8 @@ set(DLASRC dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c ) + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlarmm.c dlatrs3.c dtrsyl3.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -906,7 +913,7 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c 
zungtsqr_row.c zunhr_col.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. * Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) 
LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index f6fbfcc33..9998b1504 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, float LAPACKE_slamch( char cmach ); double LAPACKE_dlamch( char cmach ); +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ); +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ); +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ); +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ); + float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda ); double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, 
lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); float LAPACKE_slamch_work( char cmach ); double LAPACKE_dlamch_work( char cmach ); +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ); +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ); +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ); +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ); + float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* work ); @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 000000000..c931aac48 --- /dev/null +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 000000000..09c08d92a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t 
* MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 000000000..c95a772de --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; 
+ } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 000000000..272c35b38 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, 
char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 000000000..1cfc626c2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 000000000..3c50e4a45 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check 
leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 000000000..dbc9bcf9f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, 
tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 000000000..a7ebd5da6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + 
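/* matrix_layout was neither LAPACK_COL_MAJOR nor LAPACK_ROW_MAJOR */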
info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 03d15c23c..49eb69cfe 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o + cgesvdq.o clatrs3.o ctrsyl3.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o + zgesvdq.o zlatrs3.o ztrsyl3.o endif ifdef USEXBLAS diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c new file mode 100644 index 000000000..f6d76cf49 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.c @@ -0,0 +1,1282 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef 
struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? 
(__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + 
else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int clatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, complex *a, integer *lda, complex * + x, integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + complex q__1; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + real rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer lscale; + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + integer *, complex *, integer *, complex *, real *, real *, + integer *); + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! 
+ lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + clatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
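The shift expressions in the next chunk encode the fixed right-hand-side block width of 32 used by this translation: because additive operators bind tighter than shifts, k - 1 << 5 parses as (k - 1) << 5, i.e. (k - 1) * 32. A small illustrative sketch of the resulting 1-based, half-open column ranges; the helper name print_rhs_blocks is hypothetical:

#include <stdio.h>

#define NBRHS 32   /* right-hand-side block width used by the routine */

/* List the column ranges K1 <= col < K2 covered by each block column of X. */
static void print_rhs_blocks(int nrhs)
{
    int nbx = (nrhs + NBRHS - 1) / NBRHS;
    if (nbx < 1) nbx = 1;                      /* NBX = MAX(1, (NRHS+31)/32) as computed earlier */
    for (int k = 1; k <= nbx; ++k) {
        int k1 = (k - 1) * NBRHS + 1;          /* same as ((k - 1) << 5) + 1 */
        int k2 = (k * NBRHS < nrhs ? k * NBRHS : nrhs) + 1;
        printf("block %d: columns %d..%d (width %d)\n", k, k1, k2 - 1, k2 - k1);
    }
}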
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = clange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + csscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = clange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). 
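The two rescaling steps that follow can be read in isolation: the segments X( I1:I2-1, RHS ) and X( J1:J2-1, RHS ) carry local scale factors s_i and s_j, both are brought to SCAMIN = MIN( s_i, s_j ), and the extra factor SCALOC returned by SLARMM guards the GEMM against overflow. A minimal real-valued sketch with SCALOC passed in rather than computed; the function and variable names are illustrative:

/* Bring two inconsistently scaled segments to the common scale
   scamin * scaloc before the update b_i := b_i - A_ij * x_j is applied
   with GEMM.  si and sj are the local scale factors of xi and xj. */
static void make_consistent(float *xi, int ni, float *si,
                            float *xj, int nj, float *sj, float scaloc)
{
    float scamin = (*si < *sj) ? *si : *sj;

    float scal_i = (scamin / *si) * scaloc;
    if (scal_i != 1.0f)
        for (int k = 0; k < ni; ++k) xi[k] *= scal_i;

    float scal_j = (scamin / *sj) * scaloc;
    if (scal_j != 1.0f)
        for (int k = 0; k < nj; ++k) xj[k] *= scal_j;

    /* Both segments now share the same, possibly smaller, local factor. */
    *si = scamin * scaloc;
    *sj = scamin * scaloc;
}

The smallest of these local factors over all block rows later becomes SCALE( RHS ), and the final loops of the routine rescale every segment to that common value.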
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + csscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + csscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__6, &i__7, &i__8, &q__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("T", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of CLATRS3 */ + +} /* clatrs3_ */ + diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f new file mode 100644 index 000000000..a902f1ed0 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. 
If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**H* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system is so badly scaled that the solution +*> cannot be represented as (1/s(k))*x(:,k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm.
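When the same triangular matrix is reused for several calls, CNORM can be computed once and passed with NORMIN = 'Y'. A hedged sketch for a column-major upper triangular A: it fills CNORM(j) with the 1-norm of the strictly upper part of column j, which also bounds the infinity-norm and therefore satisfies both requirements quoted above. The helper name column_norms_upper is illustrative.

#include <complex.h>

/* cnorm[j-1] := 1-norm of the off-diagonal part of column j of an upper
   triangular matrix A (column-major, leading dimension lda >= n). */
static void column_norms_upper(int n, const float _Complex *a, int lda,
                               float *cnorm)
{
    for (int j = 0; j < n; ++j) {
        float s = 0.0f;
        for (int i = 0; i < j; ++i)        /* strictly above the diagonal */
            s += cabsf(a[i + j * lda]);
        cnorm[j] = s;
    }
}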
+*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. 
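For callers that size WORK statically instead of using the LWORK = -1 query, the minimum computed above (WORK(1) = LSCALE + LANRM) can be reproduced as sketched below. The block size chosen by the routine is clamped to the range 8..64, and the requirement grows as NB shrinks, so evaluating the sketch at nb = 8 gives an upper bound. The function name is illustrative:

/* Minimum WORK length for CLATRS3 given the block size nb the routine
   will use: NBA*MAX(NBA, MIN(NRHS, 32)) local scale factors plus
   NBA*NBA block norms, with NBA = ceil(N / NB). */
static int clatrs3_min_lwork(int n, int nrhs, int nb)
{
    if (nb < 8)  nb = 8;                    /* NB = MIN(64, MAX(8, ILAENV(...))) */
    if (nb > 64) nb = 64;
    int nba = (n + nb - 1) / nb;
    if (nba < 1) nba = 1;
    int nrhs_blk = nrhs < 32 ? nrhs : 32;
    int lscale = nba * (nba > nrhs_blk ? nba : nrhs_blk);
    int lanrm = nba * nba;
    return lscale + lanrm;
}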
+* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c new file mode 100644 index 000000000..3c119157c --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -0,0 +1,2022 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; 
+ doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,N) */ +/* > The upper triangular matrix B. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complexSYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
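The LDSWORK = -1 query documented above returns the SWORK dimensions in SWORK(1) and SWORK(2). A minimal usage sketch against the f2c-style prototype that follows, under the same assumptions as the earlier CLATRS3 example (LP64 integers, interleaved complex storage, no hidden string lengths); the wrapper name sylvester_blocked and the c_complex typedef are illustrative:

#include <stdlib.h>

typedef struct { float r, i; } c_complex;   /* mirrors the f2c "complex" struct */

extern int ctrsyl3_(char *trana, char *tranb, int *isgn, int *m, int *n,
                    c_complex *a, int *lda, c_complex *b, int *ldb,
                    c_complex *c, int *ldc, float *scale,
                    float *swork, int *ldswork, int *info);

/* Solve A*X + X*B = scale*C with A (m-by-m) and B (n-by-n) upper
   triangular; C is overwritten with X. */
static int sylvester_blocked(int m, int n, c_complex *a, int lda,
                             c_complex *b, int ldb, c_complex *c, int ldc,
                             float *scale)
{
    int info, isgn = 1, ldswork = -1;
    float query[2];

    /* Dimension query: SWORK(1) = ROWS, SWORK(2) = COLS on return. */
    ctrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             scale, query, &ldswork, &info);
    if (info != 0)
        return info;

    int rows = (int) query[0], cols = (int) query[1];
    ldswork = rows < 2 ? 2 : rows;            /* LDSWORK >= MAX(2, ROWS) */
    float *swork = (float *) malloc((size_t) ldswork * cols * sizeof(float));
    ctrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             scale, swork, &ldswork, &info);
    free(swork);
    return info;
}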
*/ + +/* ===================================================================== */ +/* Subroutine */ int ctrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, complex *a, integer *lda, complex *b, integer + *ldb, complex *c__, integer *ldc, real *scale, real *swork, integer * + ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3, r__4; + complex q__1; + + /* Local variables */ + real scal; + complex csgn; + real anrm, bnrm, cnrm; + integer awrk, bwrk; + real *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(real *); + integer nb, jj, ll; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + extern /* Subroutine */ int ctrsyl_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, real *, integer *); + logical lquery; + integer nba, nbb; + real buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (real*)malloc(f2cmax(*m,*n)*sizeof(real)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ctrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + q__1.r = sgn, q__1.i = 0.f; + csgn.r = q__1.r, csgn.i = q__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = clange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = clange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
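The second scaling factor trades a power of two between BUF and the SWORK entries: the binary exponent e of SCALOC is extracted (myexp_ in the translated C, a frexp wrapper, mirroring Fortran's EXPONENT), BUF absorbs 2**e, and every SWORK entry is divided by 2**e, so BUF*SWORK is unchanged while SWORK itself stays representable. A minimal sketch of the effect, with illustrative numbers and a local exponent_of helper standing in for my_expfunc:

    #include <math.h>
    #include <stdio.h>

    /* Same exponent extraction as my_expfunc()/EXPONENT(): x = f * 2**e, 0.5 <= f < 1. */
    static int exponent_of(float x) { int e; (void)frexpf(x, &e); return e; }

    int main(void)
    {
        float swork  = 1e-20f;     /* an already small local scale factor      */
        float scaloc = 1e-26f;     /* tiny factor returned by the block solve  */
        float buf    = 1.0f;       /* global second scaling factor             */

        float naive = swork * scaloc;                  /* flushes to zero      */
        printf("naive swork*scaloc = %g\n", naive);

        /* Transfer a power of two from SWORK into BUF before applying SCALOC. */
        int   e = exponent_of(scaloc);
        float t = ldexpf(1.0f, e);         /* t = 2**EXPONENT(SCALOC)          */
        buf   *= t;
        swork /= t;
        swork *= scaloc;                   /* scaloc/t is in [0.5, 1): no flush */

        printf("buf = %g, swork = %g, buf*swork = %g\n",
               buf, swork, (double) buf * (double) swork);
        return 0;
    }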
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__4, &i__5, &i__6, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = clange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__4, &i__5, &i__6, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
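SLARMM is what produces that factor: given norm estimates of the A block, the solved block, and the block being updated, it returns SCALOC <= 1 under which the update can be formed without overflow. A simplified power-of-two stand-in for that contract is sketched below; it is not the actual SLARMM algorithm (which is more careful about edge cases), and the safety bound is an illustrative choice:

    #include <float.h>
    #include <stdio.h>

    /* Return a power-of-two factor s <= 1 so that the scaled update
     * s*CNRM + ANRM*(s*XNRM) stays below an overflow bound. */
    static float update_scale(float anrm, float xnrm, float cnrm)
    {
        const float bound = FLT_MAX / 4.0f;   /* illustrative safety margin */
        float s = 1.0f;
        while (s > 0.0f && s * cnrm + anrm * (s * xnrm) > bound)
            s *= 0.5f;
        return s;
    }

    int main(void)
    {
        float s = update_scale(1e20f, 1e20f, 1e30f);
        printf("s = %g, guarded update norm = %g\n",
               s, s * 1e30f + 1e20f * (s * 1e20f));
        return 0;
    }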
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = clange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + csscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__3, &i__4, &i__5, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = clange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = clange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
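Each of these rescalings touches only the C( K1:K2-1, L1:L2-1 ) sub-block of the full array with leading dimension LDC, which is why the loops below walk the block column by column and call CSSCAL on K2-K1 contiguous entries at a time. A small stand-alone illustration of that column-major sub-block scaling follows; scale_block and the real-valued entries are illustrative (the routine scales complex entries by a real factor via CSSCAL):

    #include <stdio.h>

    /* Scale rows k1..k2-1, columns l1..l2-1 (1-based) of a column-major
     * array with leading dimension ldc, one column segment at a time. */
    static void scale_block(float *c, int ldc, int k1, int k2, int l1, int l2,
                            float scal)
    {
        for (int ll = l1; ll <= l2 - 1; ++ll)          /* block columns        */
            for (int kk = k1; kk <= k2 - 1; ++kk)      /* contiguous in memory */
                c[(kk - 1) + (ll - 1) * ldc] *= scal;
    }

    int main(void)
    {
        enum { LDC = 4, N = 3 };
        float c[LDC * N];
        for (int j = 0; j < N; ++j)
            for (int i = 0; i < LDC; ++i)
                c[i + j * LDC] = 1.0f;

        scale_block(c, LDC, 2, 4, 2, 3, 0.5f);   /* scales rows 2..3 of column 2 */
        for (int i = 0; i < LDC; ++i)
            printf("%g %g %g\n", c[i], c[i + LDC], c[i + 2 * LDC]);
        return 0;
    }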
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__2, &i__3, &i__4, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + r__3 = (r__1 = c__[i__1].r, abs(r__1)), r__4 = (r__2 = r_imag(&c__[ + c_dim1 + 1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + r__3 = scal, r__4 = (r__1 = c__[i__3].r, abs(r__1)), r__3 = + f2cmax(r__3,r__4), r__4 = (r__2 = r_imag(&c__[k + l * + c_dim1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + clascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
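A compact restatement of the clean-up phase above and the assignment that follows, in the notation of the source comments (the SCALE/BUF trading and the optional upscaling of C rescale both sides together and so preserve the final relation):

\[
  C_{k\ell} \;\leftarrow\; \frac{\min_{k',\ell'} \mathrm{SWORK}(k',\ell')}{\mathrm{SWORK}(k,\ell)}\, C_{k\ell},
  \qquad
  \mathrm{SCALE} \;=\; \mathrm{BUF}\cdot \min_{k',\ell'} \mathrm{SWORK}(k',\ell'),
\]

so that every block of the returned solution X (overwriting C) carries one common factor and

\[
  \mathrm{op}(A)\,X \;+\; \mathrm{ISGN}\cdot X\,\mathrm{op}(B) \;=\; \mathrm{SCALE}\cdot C_{\mathrm{in}} .
\]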
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of CTRSYL3 */ + +} /* ctrsyl3_ */ + diff --git a/lapack-netlib/SRC/ctrsyl3.f b/lapack-netlib/SRC/ctrsyl3.f new file mode 100644 index 000000000..586dc0207 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
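For concreteness, the sizing requested here and returned by the workspace query documented just below (MAX(NBA,NBB) rows and 2*NBB+NBA columns) works out as in the following sketch. NB is a fixed illustrative value, since the routine itself takes it from ILAENV, and imax/imin are hypothetical helpers:

    #include <stdio.h>

    static int imax(int a, int b) { return a > b ? a : b; }
    static int imin(int a, int b) { return a < b ? a : b; }

    int main(void)
    {
        int m = 1000, n = 400, nb = 96;            /* illustrative problem sizes */
        int nba = imax(1, (m + nb - 1) / nb);      /* number of row blocks of A  */
        int nbb = imax(1, (n + nb - 1) / nb);      /* column blocks of B         */

        /* SWORK layout: NBB columns of local scale factors, then NBA columns
         * holding block norms of A, then NBB columns of block norms of B.     */
        int rows = imax(2, imax(nba, nbb));
        int cols = 2 * nbb + nba;
        printf("SWORK needs %d rows (LDSWORK >= %d) and %d columns\n",
               rows, rows, cols);

        /* Block k of A covers rows K1 .. K2-1 of the matrix. */
        for (int k = 1; k <= nba; ++k) {
            int k1 = (k - 1) * nb + 1;
            int k2 = imin(k * nb, m) + 1;
            printf("A block %2d: rows %4d..%4d (%d rows)\n",
                   k, k1, k2 - 1, k2 - k1);
        }
        return 0;
    }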
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
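The "normwise largest entry" computed next is the maximum of |Re| and |Im| over all entries of C (not the complex modulus); BUF is then pushed back toward one by the largest factor that neither overflows that entry nor exceeds 1/BUF, and C is upscaled accordingly. A small stand-alone sketch of the same arithmetic, where cplx, max_abs_entry and the FLT_MIN-based bignum are illustrative stand-ins for the f2c types and SLAMCH('S'):

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    typedef struct { float r, i; } cplx;   /* stand-in for the f2c complex type */

    /* Maximum of |Re| and |Im| over all entries, as in the loop below. */
    static float max_abs_entry(const cplx *c, int nelem)
    {
        float scal = 0.0f;
        for (int k = 0; k < nelem; ++k) {
            scal = fmaxf(scal, fabsf(c[k].r));
            scal = fmaxf(scal, fabsf(c[k].i));
        }
        return scal;
    }

    int main(void)
    {
        cplx  c[2]   = { { 1.0e30f, -2.0e30f }, { -3.0e28f, 4.0e25f } };
        float buf    = 0.25f;                 /* leftover second scale factor  */
        float bignum = 1.0f / FLT_MIN;        /* roughly ONE/SMLNUM            */

        float scal   = max_abs_entry(c, 2);
        float scaloc = fminf(bignum / scal, 1.0f / buf);   /* safe upscaling   */
        printf("scal = %g, upscale C by %g, buf becomes %g\n",
               scal, scaloc, buf * scaloc);
        return 0;
    }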
+* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/lapack-netlib/SRC/dlarmm.c b/lapack-netlib/SRC/dlarmm.c new file mode 100644 index 000000000..eec5d143a --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen 
inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ + +/* Definition: */ +/* =========== */ + +/* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ + +/* DOUBLE PRECISION ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is DOUBLE PRECISION */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is DOUBLE PRECISION */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) +{ + /* System generated locals */ + doublereal ret_val; + + /* Local variables */ + extern doublereal dlamch_(char *); + doublereal bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); + bignum = 1. / smlnum / 4.; + +/* Compute a scale factor. */ + + ret_val = 1.; + if (*bnorm <= 1.) 
{ + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5 / *bnorm; + } + } + return ret_val; + +/* ==== End of DLARMM ==== */ + +} /* dlarmm_ */ + diff --git a/lapack-netlib/SRC/dlarmm.f b/lapack-netlib/SRC/dlarmm.f new file mode 100644 index 000000000..c36042009 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. 
(BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c new file mode 100644 index 000000000..46eca6379 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.c @@ -0,0 +1,1265 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + 
ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. 
*/ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is DOUBLE PRECISION array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
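The calling sequence documented above is easiest to see with a small driver. The following is a minimal sketch, not part of the changeset itself: it assumes a non-ILP64 build in which the f2c "integer" type maps to a plain C int, Fortran-style column-major storage, and linking against a library that exports the new dlatrs3_ and dlarmm_ symbols; the matrix values and the helper main are purely illustrative.

#include <stdio.h>
#include <stdlib.h>

/* Prototypes matching the f2c-translated definitions in this patch
   (non-ILP64 assumed, so integer == int, doublereal == double). */
extern int dlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    int *n, int *nrhs, double *a, int *lda,
                    double *x, int *ldx, double *scale, double *cnorm,
                    double *work, int *lwork, int *info);
extern double dlarmm_(double *anorm, double *bnorm, double *cnorm);

int main(void) {
    int n = 3, nrhs = 2, lda = 3, ldx = 3, info = 0, lwork = -1;
    /* Upper triangular A and two right-hand sides B, stored column-major. */
    double a[9] = { 2.0, 0.0, 0.0,   1.0, 3.0, 0.0,   4.0, 5.0, 6.0 };
    double x[6] = { 1.0, 1.0, 1.0,   2.0, 0.0, 1.0 };
    double scale[2], cnorm[3], wkopt;

    /* Workspace query: with LWORK = -1 the routine only reports the
       required workspace size in WORK(1). */
    dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, &wkopt, &lwork, &info);
    lwork = (int)wkopt;
    double *work = malloc((size_t)lwork * sizeof(double));
    if (work == NULL) return 1;

    /* Solve A * X = B * diag(scale) with overflow protection. */
    dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    printf("info = %d, scale = (%g, %g)\n", info, scale[0], scale[1]);

    /* The auxiliary DLARMM can also be called directly; it returns a
       factor s in (0, 1] such that s*C - A*(s*B) cannot overflow. */
    double an = 1.0e300, bn = 2.0, cn = 1.0e300;
    printf("dlarmm scale factor: %g\n", dlarmm_(&an, &bn, &cn));

    free(work);
    return 0;
}

On exit X is overwritten by the solution and SCALE(k) holds the per-column scale factor, which stays at 1.0 for a well-scaled system like this one.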
*/ + +/* ===================================================================== */ +/* Subroutine */ int dlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublereal *a, integer *lda, + doublereal *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal w[64]; + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + integer *, doublereal *, integer *, doublereal *, doublereal *, + doublereal *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I+KK*LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters */ + + if (! upper && ! 
lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + dlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = dlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + dscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = dlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + dscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = j2 - j1; + dscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) 
{ + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of DLATRS3 */ + +} /* dlatrs3_ */ + diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f new file mode 100644 index 000000000..b4a98bc78 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. 
If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. 
For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. 
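*     Illustrative sizing (an editorial note, not part of this patch):
*     assuming the block size works out to NB = 64 and, say, N = 1000
*     with NRHS = 100, one gets NBA = (1000 + 64 - 1) / 64 = 16 and
*     MAX( NBA, MIN( NRHS, NBRHS ) ) = 32, so the local scale factors
*     occupy NBA * 32 = 512 workspace entries, the block norms a
*     further NBA * NBA = 256, and WORK( 1 ) = 768. The documented
*     bound LWORK >= 2 * NBA * MAX( NBA, MIN( NRHS, 32 ) ) = 1024 is
*     therefore sufficient.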
+ LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c new file mode 100644 index 000000000..9cfbe3dab --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -0,0 +1,2060 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + 
doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DTRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by DHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is DOUBLE PRECISION array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). 
Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dtrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, + integer *ldb, doublereal *c__, integer *ldc, doublereal *scale, + integer *iwork, integer *liwork, doublereal *swork, integer *ldswork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *), dgemm_(char *, char *, integer *, integer *, integer * + , doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(doublereal *); + integer l2, nb, pc, jj, ll; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrsyl_(char *, char *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
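+   NB is taken from ILAENV for DTRSYL and bounded below by 8.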
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + dtrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0. && a[l + 1 + l * a_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0. && b[l + 1 + l * b_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. 
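+   Each SWORK( K, L ) records the combined scaling applied so far to the
+   solution block C( K, L ); the smallest of them, combined with BUF, gives
+   the returned SCALE.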
*/ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
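+   Only the power-of-two part of SCALOC is absorbed into BUF; SWORK, SCAMIN
+   and SCALOC are adjusted by the same power of two (bounded by BIGNUM) so
+   that the combined scaling stays consistent.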
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = dlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = dlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + dgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = dlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__4, &i__5, &i__6, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = dlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + dscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + dgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = dlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = dlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__2, &i__3, &i__4, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + d__2 = scal, d__3 = (d__1 = c__[k + l * c_dim1], abs(d__1)); + scal = f2cmax(d__2,d__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. 
/ buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + dlascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of DTRSYL3 */ + +} /* dtrsyl3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f new file mode 100644 index 000000000..c44ec3808 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. 
LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. 
+ EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. 
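+*     Each entry SWORK( K, L ) records the scaling applied so far to
+*     the solution block X( K, L ). All entries start at ONE, and the
+*     returned SCALE is their minimum. As an illustration (with the
+*     fallback factor BUF staying at ONE): if one block finishes with
+*     SWORK( K, L ) = 2**(-10) while all other entries remain ONE,
+*     the consistency scaling at the end multiplies every other block
+*     by 2**(-10) and the routine returns SCALE = 2**(-10).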
+* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
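+*                 For the nonzero SCALOC used here, the intrinsic
+*                 EXPONENT( SCALOC ) returns the binary exponent E with
+*                 SCALOC = F * 2**E and 0.5 <= F < 1. The power 2**E is
+*                 factored out of SCALOC and out of every local scale
+*                 factor into BUF, so SCALOC itself stays safely
+*                 representable while the overall scaling is carried
+*                 along in BUF.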
+ BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
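+*        The solution can be multiplied by at most BIGNUM / SCAL
+*        without overflowing, and multiplying by ONE / BUF would
+*        restore BUF to exactly ONE. The smaller of the two factors is
+*        applied; any remaining part of BUF is folded into SCALE below.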
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index af2850398..a639e0375 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -469,6 +469,15 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN @@ -477,6 +486,12 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/lapack-netlib/SRC/slarmm.c b/lapack-netlib/SRC/slarmm.c new file mode 100644 index 000000000..95114e2f1 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char 
*oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ + +/* Definition: */ +/* =========== */ + +/* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) */ + +/* REAL ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is REAL */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is REAL */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +real slarmm_(real *anorm, real *bnorm, real *cnorm) +{ + /* System generated locals */ + real ret_val; + + /* Local variables */ + extern real slamch_(char *); + real bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = slamch_("Safe minimum") / slamch_("Precision"); + bignum = 1.f / smlnum / 4.f; + +/* Compute a scale factor. 
*/ + + ret_val = 1.f; + if (*bnorm <= 1.f) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5f; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5f / *bnorm; + } + } + return ret_val; + +/* ==== End of SLARMM ==== */ + +} /* slarmm_ */ + diff --git a/lapack-netlib/SRC/slarmm.f b/lapack-netlib/SRC/slarmm.f new file mode 100644 index 000000000..643dd6748 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. 
(BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c new file mode 100644 index 000000000..e5c48a55b --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.c @@ -0,0 +1,1262 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + 
ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. 
*/ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is REAL array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
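The consistency scaling described in the Further Details above can be illustrated with a small standalone C sketch. The helper names below (rescale_segment, make_consistent) are hypothetical and are not part of this patch; the routine itself performs the same rescaling with SSCAL on column segments of X before the GEMM update.

#include <math.h>
#include <stddef.h>

/* Hypothetical helper: bring one vector segment from its local scale factor
 * *s_seg down to the common factor scamin, so that both operands of a block
 * update carry the same scaling. */
static void rescale_segment(float *seg, size_t len, float *s_seg, float scamin)
{
    float scal = scamin / *s_seg;        /* <= 1 because scamin <= *s_seg */
    if (scal != 1.0f) {
        for (size_t k = 0; k < len; ++k)
            seg[k] *= scal;
        *s_seg = scamin;
    }
}

/* Before B( I, K ) := B( I, K ) - op(A( I, J )) * X( J, K ), both segments
 * are brought to SCAMIN = MIN( s( I, RHS ), s( J, RHS ) ), so the GEMM acts
 * on consistently scaled data.  (The routine additionally folds in the
 * robust factor SCALOC obtained from the norm bounds.) */
static void make_consistent(float *b_i, size_t len_i, float *s_i,
                            float *x_j, size_t len_j, float *s_j)
{
    float scamin = fminf(*s_i, *s_j);
    rescale_segment(b_i, len_i, s_i, scamin);
    rescale_segment(x_j, len_j, s_j, scamin);
}
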
*/ + +/* ===================================================================== */ +/* Subroutine */ int slatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, real *a, integer *lda, real *x, + integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern logical lsame_(char *, char *); + real rscal; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk, lscale; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *); + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "SLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! 
lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + slatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
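The fallback test earlier in this routine, IF( .NOT. TMAX.LE.SLAMCH( 'Overflow' ) ) in the Fortran and the corresponding negated comparison in this translation, is written that way on purpose: every comparison involving NaN evaluates to false, so the negated form triggers the fallback for NaN as well as for Inf. A small self-contained demonstration follows; the overflow constant is an arbitrary stand-in, not SLAMCH output.

#include <math.h>
#include <stdio.h>

/* A plain "tmax > ovf" would be false for NaN, whereas !(tmax <= ovf) is
 * true for both Inf and NaN, which are exactly the cases in which the
 * blocked path must fall back to the unblocked LATRS solves. */
int main(void)
{
    float ovf = 3.0e38f;                 /* stand-in for SLAMCH('Overflow') */
    float vals[3] = { 1.0f, INFINITY, NAN };
    for (int i = 0; i < 3; ++i) {
        float tmax = vals[i];
        printf("tmax = %g  fallback = %d\n", tmax, !(tmax <= ovf));
    }
    return 0;                            /* prints fallback = 0, 1, 1 */
}
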
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = slange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + sscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = slange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). 
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + sscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + sscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of SLATRS3 */ + +} /* slatrs3_ */ + diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f new file mode 100644 index 000000000..c3a08e524 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. 
+*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. 
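The optimal LWORK can be obtained at run time with the usual LAPACK workspace query described next. A C usage sketch against the f2c-style interface declared in slatrs3.c earlier in this patch follows; it assumes a 32-bit integer (LP64) build and omits the hidden character-length arguments that some Fortran compilers append, so treat it as an illustration rather than a portable binding.

#include <stdlib.h>

typedef int   integer;   /* assumption: LP64 build, 32-bit LAPACK integers */
typedef float real;

/* Prototype as written in the f2c translation slatrs3.c above. */
extern int slatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    integer *n, integer *nrhs, real *a, integer *lda,
                    real *x, integer *ldx, real *scale, real *cnorm,
                    real *work, integer *lwork, integer *info);

/* Workspace-query idiom: call once with LWORK = -1, read the required size
 * from WORK(1), allocate, then solve. */
static void solve_upper_notrans(integer n, integer nrhs, real *a, integer lda,
                                real *x, integer ldx, real *scale, real *cnorm)
{
    integer info, lwork = -1;
    real query;
    slatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, &query, &lwork, &info);
    lwork = (integer)query;
    real *work = malloc((size_t)lwork * sizeof *work);
    slatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    free(work);
}
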
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). 
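The I1/I2 and J1/J2 indices used throughout this loop come from the uniform block partition set up above: NBA = MAX( 1, (N + NB - 1)/NB ) block rows of width NB, with the last one possibly shorter. A tiny standalone demonstration of those formulas, with arbitrarily chosen values:

#include <stdio.h>

/* Block I covers rows I1 = (I-1)*NB + 1 through I2 - 1 = MIN( I*NB, N ),
 * 1-based and half-open as in the code above.  Demo for N = 10, NB = 4. */
int main(void)
{
    int n = 10, nb = 4;
    int nba = (n + nb - 1) / nb;                    /* = 3 */
    for (int i = 1; i <= nba; ++i) {
        int i1 = (i - 1) * nb + 1;
        int i2 = (i * nb < n ? i * nb : n) + 1;
        printf("block %d: rows %d..%d (%d rows)\n", i, i1, i2 - 1, i2 - i1);
    }
    return 0;   /* blocks cover rows 1..4, 5..8 and 9..10 */
}
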
+* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. 
ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c new file mode 100644 index 000000000..8ce30ed56 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.c @@ -0,0 +1,2066 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + 
doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b STRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > STRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by SHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. 
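Given the equation documented in the Purpose section above, a computed solution can be sanity-checked by forming the residual explicitly. The sketch below is an illustration only and not part of the patch: it handles the no-transpose case with plain loops over column-major arrays and double accumulation; a real test would use GEMM and compare the result against eps times suitable norm bounds.

#include <math.h>
#include <stddef.h>

/* Largest absolute entry of A*X + isgn*X*B - scale*C for column-major
 * single-precision inputs (no-transpose case only). */
static double sylvester_residual(int m, int n, int isgn, float scale,
                                 const float *A, int lda,
                                 const float *B, int ldb,
                                 const float *X, int ldx,
                                 const float *C, int ldc)
{
    double rmax = 0.0;
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            double r = -(double)scale * C[i + (size_t)j * ldc];
            for (int k = 0; k < m; ++k)              /* (A*X)(i,j)      */
                r += (double)A[i + (size_t)k * lda] * X[k + (size_t)j * ldx];
            for (int k = 0; k < n; ++k)              /* isgn*(X*B)(i,j) */
                r += (double)isgn * X[i + (size_t)k * ldx] * B[k + (size_t)j * ldb];
            rmax = fmax(rmax, fabs(r));
        }
    }
    return rmax;
}
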
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is REAL array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. 
Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int strsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, + real *c__, integer *ldc, real *scale, integer *iwork, integer *liwork, + real *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3; + + /* Local variables */ + real scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + real *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(real *); + integer l2, nb, pc, jj, ll; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + logical lquery; + extern /* Subroutine */ int strsyl_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, integer * + , real *, integer *); + integer nba, nbb; + real buf, sgn; + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "STRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! 
+ lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } else if (! lquery && *liwork < iwork[1]) { + *info = -14; + } else if (! lquery && *ldswork < f2cmax(nba,nbb)) { + *info = -16; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + strsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + + +/* REAL WNRM( MAX( M, N ) ) */ + wnrm=(real*)malloc (f2cmax(*m,*n)*sizeof(real)); + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0.f && a[l + 1 + l * a_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0.f && b[l + 1 + l * b_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. 
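The query/solve protocol implemented above (LIWORK = -1 or LDSWORK = -1 requests a workspace query; IWORK(1) then returns the optimal LIWORK, and SWORK(1,1)/SWORK(2,1) return the optimal ROWS and COLS of SWORK) can be driven from C roughly as in the following sketch. It is illustrative only, not part of the patch; it assumes the common f2c mappings integer = int and real = float (ILP64 builds differ), column-major arrays, and omits malloc error handling.

/* Hypothetical caller sketch for the two-phase STRSYL3 call. */
#include <stdlib.h>

extern int strsyl3_(char *trana, char *tranb, int *isgn,
                    int *m, int *n, float *a, int *lda,
                    float *b, int *ldb, float *c, int *ldc,
                    float *scale, int *iwork, int *liwork,
                    float *swork, int *ldswork, int *info);

int solve_sylvester(int m, int n, float *a, int lda, float *b, int ldb,
                    float *c, int ldc, float *scale)
{
    int isgn = 1, info = 0;
    int liwork = -1, ldswork = -1;       /* trigger the workspace query */
    int iwork_q[1];
    float swork_q[2];

    /* Phase 1: query.  IWORK(1) = optimal LIWORK, SWORK(1,1)/SWORK(2,1)
     * = optimal ROWS/COLS of SWORK. */
    strsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             scale, iwork_q, &liwork, swork_q, &ldswork, &info);
    if (info != 0)
        return info;

    liwork  = iwork_q[0];
    ldswork = (int) swork_q[0];          /* optimal ROWS */
    int cols = (int) swork_q[1];         /* optimal COLS */

    int   *iwork = malloc((size_t) liwork * sizeof *iwork);
    float *swork = malloc((size_t) ldswork * (size_t) cols * sizeof *swork);

    /* Phase 2: solve A*X + X*B = scale*C; the solution X overwrites C. */
    strsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             scale, iwork, &liwork, swork, &ldswork, &info);

    free(iwork);
    free(swork);
    return info;
}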
*/ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
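For orientation, the SWORK matrix populated above doubles as bookkeeping storage: columns 1..NBB hold the local scale factors of the solution blocks, the next NBA columns (offset AWRK = NBB) hold norm bounds of the blocks of A, and the last NBB columns (offset BWRK = NBB + NBA) hold norm bounds of the blocks of B, which is why the workspace query reports MAX(NBA, NBB) rows and NBA + 2*NBB columns. The hypothetical accessors below, assuming real = float and column-major storage, only make that layout explicit; they are not part of the patch.

#include <stddef.h>

/* 1-based (k,l) access into column-major SWORK with leading dimension ldswork */
static inline float swork_at(const float *swork, int ldswork, int k, int l)
{
    return swork[(size_t)(k - 1) + (size_t)(l - 1) * (size_t) ldswork];
}

/* Local scale factor of solution block X(K,L): columns 1..NBB. */
static inline float scale_kl(const float *swork, int ldswork, int k, int l)
{
    return swork_at(swork, ldswork, k, l);
}

/* Norm bound of block A(I,K): columns NBB+1..NBB+NBA (AWRK = NBB). */
static inline float anrm_ik(const float *swork, int ldswork, int nbb, int i, int k)
{
    return swork_at(swork, ldswork, i, nbb + k);
}

/* Norm bound of block B(L,J): columns NBB+NBA+1..NBA+2*NBB (BWRK = NBB+NBA). */
static inline float bnrm_lj(const float *swork, int ldswork, int nbb, int nba,
                            int l, int j)
{
    return swork_at(swork, ldswork, l, nbb + nba + j);
}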
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = slange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = slange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + sgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = slange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__4, &i__5, &i__6, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = slange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + sscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + sgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = slange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = slange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__2, &i__3, &i__4, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + r__2 = scal, r__3 = (r__1 = c__[k + l * c_dim1], abs(r__1)); + scal = f2cmax(r__2,r__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. 
*/ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + slascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of STRSYL3 */ + +} /* strsyl3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f new file mode 100644 index 000000000..28762c2ed --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.f @@ -0,0 +1,1244 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. 
LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. 
+ INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. 
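The partitioning above can be summarized as: start from equally sized blocks of NB rows and push any boundary forward by one row when it would cut through a 2-by-2 diagonal block (recognizable in Schur canonical form by a nonzero subdiagonal entry), merging an emptied trailing block into its predecessor. The helper below is a simplified, hypothetical C sketch of that idea, not part of the patch; it assumes column-major storage, NB >= 2, and 1-based start indices as stored in IWORK.

static void partition_blocks(int m, int nb, const float *a, int lda,
                             int *starts,   /* length >= m/nb + 2 */
                             int *nblocks)
{
    int nba = (m + nb - 1) / nb;
    if (nba < 1)
        nba = 1;
    for (int i = 0; i < nba; i++)
        starts[i] = i * nb + 1;        /* first row of block i (1-based) */
    starts[nba] = m + 1;               /* sentinel: one past the last row */

    for (int k = 1; k < nba; k++) {
        int b = starts[k];
        /* A(b, b-1) != 0 means rows b-1 and b belong to one 2-by-2 block. */
        if (a[(size_t)(b - 1) + (size_t)(b - 2) * (size_t) lda] != 0.0f)
            starts[k] = b + 1;         /* move the boundary past the pair */
    }

    /* If the last block became empty, merge it into its predecessor. */
    if (nba > 1 && starts[nba - 1] >= starts[nba]) {
        starts[nba - 1] = starts[nba];
        --nba;
    }
    *nblocks = nba;
}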
+* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
+ BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
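Before every GEMM update in the loop above, the two participating blocks of C are brought onto a common local scale factor and SLARMM supplies an extra factor that keeps the update below overflow. The sketch below condenses that bookkeeping for a single update C(I,L) := C(I,L) - A(I,K)*C(K,L); it is illustrative only, assumes the f2c mapping real = float used in the C translation earlier in this patch, and omits the underflow branch that absorbs powers of two into BUF.

/* SLARMM is added by this patch; prototype as in the f2c translation. */
extern float slarmm_(float *anrm, float *xnrm, float *cnrm);

static float consistent_update_scale(float anrm,      /* norm bound of A(I,K) */
                                     float xnrm,      /* norm of C(K,L)       */
                                     float cnrm,      /* norm of C(I,L)       */
                                     float *swork_il, /* scale of C(I,L)      */
                                     float *swork_kl) /* scale of C(K,L)      */
{
    float scamin = (*swork_il < *swork_kl) ? *swork_il : *swork_kl;

    /* Rescale the norms as if both blocks already carried scamin. */
    cnrm *= scamin / *swork_il;
    xnrm *= scamin / *swork_kl;

    /* Robust factor from SLARMM so that the scaled update cannot overflow. */
    float scaloc = slarmm_(&anrm, &xnrm, &cnrm);

    /* The caller scales C(K,L) by (scamin/swork_kl)*scaloc and C(I,L) by
     * (scamin/swork_il)*scaloc before the GEMM; afterwards both blocks
     * share the recorded factor scamin*scaloc. */
    *swork_il = scamin * scaloc;
    *swork_kl = scamin * scaloc;
    return scaloc;
}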
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
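+*        Illustrative example (values are hypothetical): if the
+*        intermediate computation left BUF = 2**(-10) and the largest
+*        entry of C has magnitude SCAL = 2**(-40), then BIGNUM / SCAL
+*        exceeds ONE / BUF, so SCALOC = ONE / BUF = 2**10; BUF is
+*        restored to ONE, C is upscaled by 2**10, and SCALE is not
+*        flushed when it is multiplied by BUF below.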
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END diff --git a/lapack-netlib/SRC/zlatrs3.c b/lapack-netlib/SRC/zlatrs3.c new file mode 100644 index 000000000..0cb1cda54 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.c @@ -0,0 +1,1283 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + 
+#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX*16 A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. 
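+   As an illustrative example: for an upper triangular 2-by-2 matrix,
+   the off-diagonal part of column 1 is empty and that of column 2 is
+   the single entry A(1,2), so CNORM(1) = 0 and CNORM(2) = abs(A(1,2)).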
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
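+   Worked example of the consistent scaling described above (numbers
+   are hypothetical): with local scale factors s( I, RHS ) = 1 and
+   s( J, RHS ) = 1/4, SCAMIN = 1/4 and the segment of block row I is
+   multiplied by SCAMIN / s( I, RHS ) = 1/4 before the GEMM update, so
+   that both segments carry the common factor 1/4 (possibly reduced
+   further by the robust update factor returned by DLARMM).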
*/ + +/* ===================================================================== */ +/* Subroutine */ int zlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublecomplex *a, integer *lda, + doublecomplex *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + doublereal w[64]; + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit; + extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublereal *, integer *); + logical lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
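+   As an illustration (with an assumed block size nb = 64): for n = 1000
+   and nrhs >= 32, nba = 16, so the scale factors occupy
+   lscale = 16 * 32 = 512 entries, the block norms occupy
+   lanrm = 16 * 16 = 256 entries, and WORK( 1 ) is set to 768 below.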
*/ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + zlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
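+   For example (illustrative): with NRHS = 50 and the block width of 32
+   used here, NBX = 2; the block column K = 1 covers columns 1:32 and
+   K = 2 covers columns 33:50.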
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = zlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + zdscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = zlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + zdscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + zdscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__6, &i__7, &i__8, &z__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("T", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of ZLATRS3 */ + +} /* zlatrs3_ */ + diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f new file mode 100644 index 000000000..fc1be0517 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. 
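+*>
+*> For illustration: if on exit scale(j) = 2**(-100) for some column j,
+*> then X(:,j) represents the solution y of A * y = B(:,j) in the form
+*> y = X(:,j) / scale(j), whose entries may be too large to be formed
+*> explicitly without overflow.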
+*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). 
+*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. 
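+* The entry WORK( AWRK + I+(J-1)*NBA ) bounds the infinity norm of
+* op(A)( I, J ): for TRANS = 'N' it is the infinity norm of the block
+* A( I, J ), and for TRANS = 'T' or 'C' it is the 1-norm of A( J, I ),
+* which equals the infinity norm of its (conjugate) transpose.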
+* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. 
+ SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). 
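+*                 Illustrative example: if WORK( I+KK*LDS ) = ONE,
+*                 WORK( J+KK*LDS ) = 0.25 and DLARMM returns
+*                 SCALOC = 0.5, then SCAMIN = 0.25; the segment in
+*                 block row I is scaled by 0.125, the one in block
+*                 row J by 0.5, and both local factors become
+*                 SCAMIN*SCALOC = 0.125.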
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c new file mode 100644 index 000000000..314b0f98d --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -0,0 +1,2027 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + 
ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? 
floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + 
if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX*16 array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complex16SYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden.
*/ + +/* ===================================================================== */ +/* Subroutine */ int ztrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex + *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, + doublereal *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3, d__4; + doublecomplex z__1; + + /* Local variables */ + doublereal scal; + doublecomplex csgn; + doublereal anrm, bnrm, cnrm; + integer awrk, bwrk; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(doublereal *); + integer nb, jj, ll; + extern doublereal dlamch_(char *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex * + , integer *, integer *); + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int ztrsyl_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ztrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + z__1.r = sgn, z__1.i = 0.; + csgn.r = z__1.r, csgn.i = z__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = zlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = zlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__4, &i__5, &i__6, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = zlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__4, &i__5, &i__6, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = zlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + zdscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__3, &i__4, &i__5, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = zlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = zlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__2, &i__3, &i__4, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + d__3 = (d__1 = c__[i__1].r, abs(d__1)), d__4 = (d__2 = d_imag(&c__[ + c_dim1 + 1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + d__3 = scal, d__4 = (d__1 = c__[i__3].r, abs(d__1)), d__3 = + f2cmax(d__3,d__4), d__4 = (d__2 = d_imag(&c__[k + l * + c_dim1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + zlascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of ZTRSYL3 */ + +} /* ztrsyl3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.f b/lapack-netlib/SRC/ztrsyl3.f new file mode 100644 index 000000000..b5a058da4 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
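The linear updates in these loops all follow the same scale-bookkeeping pattern: the two C blocks involved are first brought to a common local scale SCAMIN, the minimum of their SWORK entries; DLARMM then supplies a further factor SCALOC that keeps the GEMM below the overflow threshold; both blocks are rescaled column by column by (SCAMIN / SWORK(..)) * SCALOC; and both record SCAMIN * SCALOC as their new scale. When SCAMIN * SCALOC would itself underflow, the correction is folded into BUF as an exact power of two, 2**EXPONENT(SCALOC), so no rounding is introduced. A rough C sketch of this bookkeeping, with the DLARMM result treated as a given scalar (this is not an implementation of DLARMM):

    /* Sketch of the scale bookkeeping performed before each block update
       C(I,L) := C(I,L) - A(I,K)*C(K,L).  swork_i and swork_k are the
       current local scale factors of the two blocks; scaloc is the factor
       returned by DLARMM and is simply taken as an input here. */
    #include <math.h>

    /* Fortran's 2.0**EXPONENT(x): the power of two p with 0.5 <= |x|/p < 1. */
    static double pow2_exponent(double x)
    {
        int e;
        frexp(x, &e);
        return ldexp(1.0, e);
    }

    static double reconcile_scales(double swork_i, double swork_k,
                                   double scaloc,        /* from DLARMM   */
                                   double *scal_i, double *scal_k)
    {
        double scamin = swork_i < swork_k ? swork_i : swork_k;
        if (scaloc * scamin == 0.0) {
            /* The routine also multiplies BUF by this power of two and
               divides every SWORK entry by it; omitted in this sketch.   */
            double p = pow2_exponent(scaloc);
            scamin /= p;
            scaloc /= p;
        }
        *scal_k = (scamin / swork_k) * scaloc;  /* applied to C(K,L)       */
        *scal_i = (scamin / swork_i) * scaloc;  /* applied to C(I,L)       */
        return scamin * scaloc;                 /* new common scale factor */
    }

The column updates C(K,J) follow the same pattern with the cached B block norms in place of the A block norms.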
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
+* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 226004a90..d252c7fa9 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,7 +93,7 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index bccfccf95..942ae6982 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ 
zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - zsgt01.o zslect.o \ + zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all diff --git a/lapack-netlib/TESTING/EIG/cchkec.f b/lapack-netlib/TESTING/EIG/cchkec.f index 6727a0954..c892b0a54 100644 --- a/lapack-netlib/TESTING/EIG/cchkec.f +++ b/lapack-netlib/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/lapack-netlib/TESTING/EIG/cerrec.f b/lapack-netlib/TESTING/EIG/cerrec.f index 650ab2b6e..6e2e1d38a 100644 --- a/lapack-netlib/TESTING/EIG/cerrec.f +++ b/lapack-netlib/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. 
- EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f new file mode 100644 index 000000000..e21f1a7a0 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. + COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
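The residual ratio described above is evaluated at a possibly reduced scale: when norm(X) and max(norm(A), norm(B)) are both large, the whole residual is formed with an extra factor RMUL = 1/max of the two norms so the verification GEMMs cannot overflow, and the denominator is floored by SMLNUM terms so that a tiny X does not inflate the ratio. A small C sketch of these two guards, using FLT_EPSILON and FLT_MIN as stand-ins for SLAMCH('P') and SLAMCH('S') (close for IEEE single precision, but an approximation):

    /* Sketch: the overflow guard RMUL and the guarded test ratio used by
       the Sylvester tests.  res1 is the max-norm of the scaled residual
       rmul*(op(A)*X + isgn*X*op(B) - scale*C); xnrm is the max-norm of X
       and tnrm the larger of the max-norms of A and B (from xLANGE('M',..)). */
    #include <math.h>
    #include <float.h>

    static float choose_rmul(float xnrm, float tnrm)
    {
        float smlnum = FLT_MIN / FLT_EPSILON;   /* ~ SLAMCH('S')/EPS */
        float bignum = 1.0f / smlnum;
        if (xnrm > 1.0f && tnrm > 1.0f && xnrm > bignum / tnrm)
            return 1.0f / (xnrm > tnrm ? xnrm : tnrm);
        return 1.0f;
    }

    static float test_ratio(float res1, float xnrm, float tnrm, float rmul)
    {
        float eps    = FLT_EPSILON;             /* ~ SLAMCH('P') */
        float smlnum = FLT_MIN / eps;
        float den    = smlnum;
        if (smlnum * xnrm > den)
            den = smlnum * xnrm;
        if ((fabsf(rmul) * tnrm) * eps * xnrm > den)
            den = (fabsf(rmul) * tnrm) * eps * xnrm;
        return res1 / den;                      /* compared against THRESH */
    }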
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/dchkec.f b/lapack-netlib/TESTING/EIG/dchkec.f index 854961884..c4451a627 100644 --- a/lapack-netlib/TESTING/EIG/dchkec.f +++ b/lapack-netlib/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/lapack-netlib/TESTING/EIG/derrec.f b/lapack-netlib/TESTING/EIG/derrec.f index d5863ad42..f11f48887 100644 --- a/lapack-netlib/TESTING/EIG/derrec.f +++ b/lapack-netlib/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. 
- EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f new file mode 100644 index 000000000..782d2cd42 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. 
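Besides the residuals, the driver records in NFAIL(3) whether DTRSYL3 and DTRSYL disagree in a way that would make the blocked routine unsafe as a drop-in replacement: DTRSYL3 may avoid a flush (it may return a positive SCALE where DTRSYL returns zero), but it must never flush where DTRSYL succeeds, and the two INFO values are expected to agree. The second pass of the loop also scales the diagonal of the random upper triangular A down by VM(2) = 1e-6 while leaving its off-diagonal part alone, presumably to drive growth in X and exercise the rescaling and nonzero-INFO paths. A trivial C sketch of the deviation predicate (the function name is illustrative only):

    /* Sketch: the consistency check behind NFAIL(3).  scale and iinfo come
       from DTRSYL, scale3 and info from DTRSYL3 run on the same data. */
    static int trsyl3_deviates(double scale, int iinfo, double scale3, int info)
    {
        /* A flush by the blocked routine where the unblocked one succeeded,
           or differing INFO diagnostics, counts as a deviation. */
        return (scale3 == 0.0 && scale > 0.0) || (iinfo != info);
    }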
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/schkec.f b/lapack-netlib/TESTING/EIG/schkec.f index e6123e1ad..59abb2466 100644 --- a/lapack-netlib/TESTING/EIG/schkec.f +++ b/lapack-netlib/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/lapack-netlib/TESTING/EIG/serrec.f b/lapack-netlib/TESTING/EIG/serrec.f index 249f0e642..9a7ceb362 100644 --- a/lapack-netlib/TESTING/EIG/serrec.f +++ b/lapack-netlib/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. 
- EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f new file mode 100644 index 000000000..22d089dc8 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/zchkec.f b/lapack-netlib/TESTING/EIG/zchkec.f index 1e1c29e0d..62a76d357 100644 --- a/lapack-netlib/TESTING/EIG/zchkec.f +++ b/lapack-netlib/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/lapack-netlib/TESTING/EIG/zerrec.f b/lapack-netlib/TESTING/EIG/zerrec.f index dc6129da9..e1938f57d 100644 --- a/lapack-netlib/TESTING/EIG/zerrec.f +++ b/lapack-netlib/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. 
@@ -141,6 +141,43 @@ CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f new file mode 100644 index 000000000..1e8619a34 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. + COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. 
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of ZSYL01 +* + END diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..c55b07643 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index db65edd88..9ba784f62 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/dchktr.f b/lapack-netlib/TESTING/LIN/dchktr.f index a4a1150c0..57e87326b 100644 --- a/lapack-netlib/TESTING/LIN/dchktr.f +++ b/lapack-netlib/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. 
CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index a667f0d2b..d0580497d 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 66fa0bee7..5aeb1ce88 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index f0d0a0ef2..af1ce0a8e 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. 
* IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 098040ace..211b92154 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
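
The two stand-alone programs below are not part of the patch; they are minimal sketches of how the routines exercised by the new test drivers might be called directly. They assume a LAPACK build that exports DTRSYL3 and DLATRS3 (the routines added above) and a BLAS providing DGEMM and DLANGE. Program names, matrix data, and workspace sizes are illustrative only; in particular the IWORK/SWORK/WORK dimensions are simply over-allocated rather than derived from the documented minimum-size formulas. The first sketch mirrors the check performed by SSYL01/ZSYL01: it solves A*X + X*B = SCALE*C for upper triangular A and B and prints the max-norm of A*X + X*B - SCALE*C; the test drivers additionally divide that residual by EPS*max(norm(A),norm(B))*norm(X) before comparing it with THRESH.

*     Sketch 1 (assumed example, not part of the patch): solve
*     A*X + X*B = SCALE*C with DTRSYL3 and form the raw residual.
      PROGRAM TRSYL3D
      IMPLICIT NONE
      INTEGER            M, N, LDSWORK, LIWORK
      PARAMETER          ( M = 2, N = 2, LDSWORK = 18, LIWORK = 16 )
      DOUBLE PRECISION   A( M, M ), B( N, N ), C( M, N ), X( M, N )
      DOUBLE PRECISION   SWORK( LDSWORK, 54 ), DUM( 1 )
      DOUBLE PRECISION   SCALE, RES, DLANGE
      INTEGER            IWORK( LIWORK ), INFO, I, J
      EXTERNAL           DTRSYL3, DGEMM, DLANGE
*     A and B are upper triangular (trivially in Schur form); the
*     IWORK/SWORK sizes are over-allocated for this tiny case.
      DATA A / 2.0D0, 0.0D0, 1.0D0, 3.0D0 /
      DATA B / 1.0D0, 0.0D0, -1.0D0, 4.0D0 /
      DATA C / 1.0D0, 2.0D0, 3.0D0, 4.0D0 /
*     DTRSYL3 overwrites the right-hand side with X, so keep C intact.
      DO J = 1, N
         DO I = 1, M
            X( I, J ) = C( I, J )
         END DO
      END DO
      CALL DTRSYL3( 'N', 'N', 1, M, N, A, M, B, N, X, M, SCALE,
     $              IWORK, LIWORK, SWORK, LDSWORK, INFO )
*     Residual C := A*X - SCALE*C, then C := X*B + C.
      CALL DGEMM( 'N', 'N', M, N, M, 1.0D0, A, M, X, M, -SCALE, C, M )
      CALL DGEMM( 'N', 'N', M, N, N, 1.0D0, X, M, B, N, 1.0D0, C, M )
      RES = DLANGE( 'M', M, N, C, M, DUM )
      WRITE( *, * ) 'INFO=', INFO, ' SCALE=', SCALE, ' RESID=', RES
      END

The second sketch shows the multi-right-hand-side robust triangular solver DLATRS3 that TEST 10 in the *CHKTR drivers above exercises: each column K of the right-hand side is solved as op(A)*x = SCALES(K)*b_K with its own scale factor chosen to avoid overflow, and CNORM carries the column norms used for the overflow analysis, as in DLATRS.

*     Sketch 2 (assumed example, not part of the patch): triangular
*     solve with two right-hand sides and per-column scale factors.
      PROGRAM LATRS3D
      IMPLICIT NONE
      INTEGER            N, NRHS, LWORK
      PARAMETER          ( N = 3, NRHS = 2, LWORK = 1000 )
      DOUBLE PRECISION   A( N, N ), X( N, NRHS ), SCALES( NRHS ),
     $                   CNORM( N ), WORK( LWORK )
      INTEGER            INFO
      EXTERNAL           DLATRS3
*     Upper triangular A (stored column-major) and the right-hand
*     sides; LWORK is over-allocated for this tiny case.
      DATA A / 4.0D0, 0.0D0, 0.0D0,
     $         1.0D0, 3.0D0, 0.0D0,
     $         2.0D0, 1.0D0, 2.0D0 /
      DATA X / 1.0D0, 1.0D0, 1.0D0,
     $         2.0D0, 2.0D0, 2.0D0 /
*     On exit X( :, K ) solves A * x = SCALES( K ) * b_K, with each
*     SCALES( K ) <= 1 chosen to avoid overflow.
      CALL DLATRS3( 'U', 'N', 'N', 'N', N, NRHS, A, N, X, N,
     $              SCALES, CNORM, WORK, LWORK, INFO )
      WRITE( *, * ) 'INFO=', INFO, ' SCALES=', SCALES( 1 ), SCALES( 2 )
      END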