Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651)
This commit is contained in:
parent
7eb2653268
commit
6eb707d941
|
@ -0,0 +1,666 @@
|
|||
*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
* X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* CHARACTER DIAG, NORMIN, TRANS, UPLO
|
||||
* INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
* REAL CNORM( * ), SCALE( * ), WORK( * )
|
||||
* COMPLEX A( LDA, * ), X( LDX, * )
|
||||
* ..
|
||||
*
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =============
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> CLATRS3 solves one of the triangular systems
|
||||
*>
|
||||
*> A * X = B * diag(scale), A**T * X = B * diag(scale), or
|
||||
*> A**H * X = B * diag(scale)
|
||||
*>
|
||||
*> with scaling to prevent overflow. Here A is an upper or lower
|
||||
*> triangular matrix, A**T denotes the transpose of A, A**H denotes the
|
||||
*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale
|
||||
*> is an nrhs-element vector of scaling factors. A scaling factor scale(j)
|
||||
*> is usually less than or equal to 1, chosen such that X(:,j) is less
|
||||
*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0
|
||||
*> for some j), then a non-trivial solution to A*X = 0 is returned. If
|
||||
*> the system is so badly scaled that the solution cannot be represented
|
||||
*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
|
||||
*>
|
||||
*> This is a BLAS-3 version of LATRS for solving several right
|
||||
*> hand sides simultaneously.
|
||||
*>
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* ==========
|
||||
*
|
||||
*> \param[in] UPLO
|
||||
*> \verbatim
|
||||
*> UPLO is CHARACTER*1
|
||||
*> Specifies whether the matrix A is upper or lower triangular.
|
||||
*> = 'U': Upper triangular
|
||||
*> = 'L': Lower triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] TRANS
|
||||
*> \verbatim
|
||||
*> TRANS is CHARACTER*1
|
||||
*> Specifies the operation applied to A.
|
||||
*> = 'N': Solve A * x = s*b (No transpose)
|
||||
*> = 'T': Solve A**T* x = s*b (Transpose)
|
||||
*> = 'C': Solve A**T* x = s*b (Conjugate transpose)
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] DIAG
|
||||
*> \verbatim
|
||||
*> DIAG is CHARACTER*1
|
||||
*> Specifies whether or not the matrix A is unit triangular.
|
||||
*> = 'N': Non-unit triangular
|
||||
*> = 'U': Unit triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NORMIN
|
||||
*> \verbatim
|
||||
*> NORMIN is CHARACTER*1
|
||||
*> Specifies whether CNORM has been set or not.
|
||||
*> = 'Y': CNORM contains the column norms on entry
|
||||
*> = 'N': CNORM is not set on entry. On exit, the norms will
|
||||
*> be computed and stored in CNORM.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] N
|
||||
*> \verbatim
|
||||
*> N is INTEGER
|
||||
*> The order of the matrix A. N >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NRHS
|
||||
*> \verbatim
|
||||
*> NRHS is INTEGER
|
||||
*> The number of columns of X. NRHS >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] A
|
||||
*> \verbatim
|
||||
*> A is COMPLEX array, dimension (LDA,N)
|
||||
*> The triangular matrix A. If UPLO = 'U', the leading n by n
|
||||
*> upper triangular part of the array A contains the upper
|
||||
*> triangular matrix, and the strictly lower triangular part of
|
||||
*> A is not referenced. If UPLO = 'L', the leading n by n lower
|
||||
*> triangular part of the array A contains the lower triangular
|
||||
*> matrix, and the strictly upper triangular part of A is not
|
||||
*> referenced. If DIAG = 'U', the diagonal elements of A are
|
||||
*> also not referenced and are assumed to be 1.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDA
|
||||
*> \verbatim
|
||||
*> LDA is INTEGER
|
||||
*> The leading dimension of the array A. LDA >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] X
|
||||
*> \verbatim
|
||||
*> X is COMPLEX array, dimension (LDX,NRHS)
|
||||
*> On entry, the right hand side B of the triangular system.
|
||||
*> On exit, X is overwritten by the solution matrix X.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDX
|
||||
*> \verbatim
|
||||
*> LDX is INTEGER
|
||||
*> The leading dimension of the array X. LDX >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] SCALE
|
||||
*> \verbatim
|
||||
*> SCALE is REAL array, dimension (NRHS)
|
||||
*> The scaling factor s(k) is for the triangular system
|
||||
*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k).
|
||||
*> If SCALE = 0, the matrix A is singular or badly scaled.
|
||||
*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
|
||||
*> that is an exact or approximate solution to A*x(:,k) = 0
|
||||
*> is returned. If the system so badly scaled that solution
|
||||
*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
|
||||
*> is returned.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is REAL array, dimension (N)
|
||||
*>
|
||||
*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
|
||||
*> contains the norm of the off-diagonal part of the j-th column
|
||||
*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal
|
||||
*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
|
||||
*> must be greater than or equal to the 1-norm.
|
||||
*>
|
||||
*> If NORMIN = 'N', CNORM is an output argument and CNORM(j)
|
||||
*> returns the 1-norm of the offdiagonal part of the j-th column
|
||||
*> of A.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] WORK
|
||||
*> \verbatim
|
||||
*> WORK is REAL array, dimension (LWORK).
|
||||
*> On exit, if INFO = 0, WORK(1) returns the optimal size of
|
||||
*> WORK.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LWORK
|
||||
*> LWORK is INTEGER
|
||||
*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
|
||||
*> NBA = (N + NB - 1)/NB and NB is the optimal block size.
|
||||
*>
|
||||
*> If LWORK = -1, then a workspace query is assumed; the routine
|
||||
*> only calculates the optimal dimensions of the WORK array, returns
|
||||
*> this value as the first entry of the WORK array, and no error
|
||||
*> message related to LWORK is issued by XERBLA.
|
||||
*>
|
||||
*> \param[out] INFO
|
||||
*> \verbatim
|
||||
*> INFO is INTEGER
|
||||
*> = 0: successful exit
|
||||
*> < 0: if INFO = -k, the k-th argument had an illegal value
|
||||
*> \endverbatim
|
||||
*
|
||||
* Authors:
|
||||
* ========
|
||||
*
|
||||
*> \author Univ. of Tennessee
|
||||
*> \author Univ. of California Berkeley
|
||||
*> \author Univ. of Colorado Denver
|
||||
*> \author NAG Ltd.
|
||||
*
|
||||
*> \ingroup doubleOTHERauxiliary
|
||||
*> \par Further Details:
|
||||
* =====================
|
||||
* \verbatim
|
||||
* The algorithm follows the structure of a block triangular solve.
|
||||
* The diagonal block is solved with a call to the robust the triangular
|
||||
* solver LATRS for every right-hand side RHS = 1, ..., NRHS
|
||||
* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
|
||||
* where op( A ) = A or op( A ) = A**T or op( A ) = A**H.
|
||||
* The linear block updates operate on block columns of X,
|
||||
* B( I, K ) - op(A( I, J )) * X( J, K )
|
||||
* and use GEMM. To avoid overflow in the linear block update, the worst case
|
||||
* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
|
||||
* such that
|
||||
* || s * B( I, RHS )||_oo
|
||||
* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold
|
||||
*
|
||||
* Once all columns of a block column have been rescaled (BLAS-1), the linear
|
||||
* update is executed with GEMM without overflow.
|
||||
*
|
||||
* To limit rescaling, local scale factors track the scaling of column segments.
|
||||
* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
|
||||
* per right-hand side column RHS = 1, ..., NRHS. The global scale factor
|
||||
* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
|
||||
* I = 1, ..., NBA.
|
||||
* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
|
||||
* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
|
||||
* linear update of potentially inconsistently scaled vector segments
|
||||
* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
|
||||
* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
|
||||
* if necessary, rescales the blocks prior to calling GEMM.
|
||||
*
|
||||
* \endverbatim
|
||||
* =====================================================================
|
||||
* References:
|
||||
* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
|
||||
* Parallel robust solution of triangular linear systems. Concurrency
|
||||
* and Computation: Practice and Experience, 31(19), e5064.
|
||||
*
|
||||
* Contributor:
|
||||
* Angelika Schwarz, Umea University, Sweden.
|
||||
*
|
||||
* =====================================================================
|
||||
SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
$ X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
IMPLICIT NONE
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
CHARACTER DIAG, TRANS, NORMIN, UPLO
|
||||
INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
COMPLEX A( LDA, * ), X( LDX, * )
|
||||
REAL CNORM( * ), SCALE( * ), WORK( * )
|
||||
* ..
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
REAL ZERO, ONE
|
||||
PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
|
||||
COMPLEX CZERO, CONE
|
||||
PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) )
|
||||
PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) )
|
||||
INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN
|
||||
PARAMETER ( NRHSMIN = 2, NBRHS = 32 )
|
||||
PARAMETER ( NBMIN = 8, NBMAX = 64 )
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
REAL W( NBMAX ), XNRM( NBRHS )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER
|
||||
INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
|
||||
$ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
|
||||
$ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
|
||||
REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
|
||||
$ SCAMIN, SMLNUM, TMAX
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
REAL SLAMCH, CLANGE, SLARMM
|
||||
EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL CLATRS, CSSCAL, XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC ABS, MAX, MIN
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
NOTRAN = LSAME( TRANS, 'N' )
|
||||
NOUNIT = LSAME( DIAG, 'N' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
*
|
||||
* Partition A and X into blocks.
|
||||
*
|
||||
NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) )
|
||||
NB = MIN( NBMAX, NB )
|
||||
NBA = MAX( 1, (N + NB - 1) / NB )
|
||||
NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
|
||||
*
|
||||
* Compute the workspace
|
||||
*
|
||||
* The workspace comprises two parts.
|
||||
* The first part stores the local scale factors. Each simultaneously
|
||||
* computed right-hand side requires one local scale factor per block
|
||||
* row. WORK( I + KK * LDS ) is the scale factor of the vector
|
||||
* segment associated with the I-th block row and the KK-th vector
|
||||
* in the block column.
|
||||
LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
|
||||
LDS = NBA
|
||||
* The second part stores upper bounds of the triangular A. There are
|
||||
* a total of NBA x NBA blocks, of which only the upper triangular
|
||||
* part or the lower triangular part is referenced. The upper bound of
|
||||
* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
|
||||
LANRM = NBA * NBA
|
||||
AWRK = LSCALE
|
||||
WORK( 1 ) = LSCALE + LANRM
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
|
||||
$ LSAME( TRANS, 'C' ) ) THEN
|
||||
INFO = -2
|
||||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
|
||||
INFO = -3
|
||||
ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
|
||||
$ LSAME( NORMIN, 'N' ) ) THEN
|
||||
INFO = -4
|
||||
ELSE IF( N.LT.0 ) THEN
|
||||
INFO = -5
|
||||
ELSE IF( NRHS.LT.0 ) THEN
|
||||
INFO = -6
|
||||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -8
|
||||
ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -10
|
||||
ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
|
||||
INFO = -14
|
||||
END IF
|
||||
IF( INFO.NE.0 ) THEN
|
||||
CALL XERBLA( 'CLATRS3', -INFO )
|
||||
RETURN
|
||||
ELSE IF( LQUERY ) THEN
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Initialize scaling factors
|
||||
*
|
||||
DO KK = 1, NRHS
|
||||
SCALE( KK ) = ONE
|
||||
END DO
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( MIN( N, NRHS ).EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Determine machine dependent constant to control overflow.
|
||||
*
|
||||
BIGNUM = SLAMCH( 'Overflow' )
|
||||
SMLNUM = SLAMCH( 'Safe Minimum' )
|
||||
*
|
||||
* Use unblocked code for small problems
|
||||
*
|
||||
IF( NRHS.LT.NRHSMIN ) THEN
|
||||
CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ),
|
||||
$ SCALE( 1 ), CNORM, INFO )
|
||||
DO K = 2, NRHS
|
||||
CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Compute norms of blocks of A excluding diagonal blocks and find
|
||||
* the block with the largest norm TMAX.
|
||||
*
|
||||
TMAX = ZERO
|
||||
DO J = 1, NBA
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
IF ( UPPER ) THEN
|
||||
IFIRST = 1
|
||||
ILAST = J - 1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
END IF
|
||||
DO I = IFIRST, ILAST
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Compute upper bound of A( I1:I2-1, J1:J2-1 ).
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + I+(J-1)*NBA ) = ANRM
|
||||
ELSE
|
||||
ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + J+(I-1)*NBA ) = ANRM
|
||||
END IF
|
||||
TMAX = MAX( TMAX, ANRM )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN
|
||||
*
|
||||
* Some matrix entries have huge absolute value. At least one upper
|
||||
* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
|
||||
* number, either due to overflow in LANGE or due to Inf in A.
|
||||
* Fall back to LATRS. Set normin = 'N' for every right-hand side to
|
||||
* force computation of TSCAL in LATRS to avoid the likely overflow
|
||||
* in the computation of the column norms CNORM.
|
||||
*
|
||||
DO K = 1, NRHS
|
||||
CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Every right-hand side requires workspace to store NBA local scale
|
||||
* factors. To save workspace, X is computed successively in block columns
|
||||
* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
|
||||
* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
|
||||
DO K = 1, NBX
|
||||
* Loop over block columns (index = K) of X and, for column-wise scalings,
|
||||
* over individual columns (index = KK).
|
||||
* K1: column index of the first column in X( J, K )
|
||||
* K2: column index of the first column in X( J, K+1 )
|
||||
* so the K2 - K1 is the column count of the block X( J, K )
|
||||
K1 = (K-1)*NBRHS + 1
|
||||
K2 = MIN( K*NBRHS, NRHS ) + 1
|
||||
*
|
||||
* Initialize local scaling factors of current block column X( J, K )
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
DO I = 1, NBA
|
||||
WORK( I+KK*LDS ) = ONE
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
ELSE
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
* where op(A) = A**T or op(A) = A**H
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
ELSE
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
END IF
|
||||
END IF
|
||||
|
||||
DO J = JFIRST, JLAST, JINC
|
||||
* J1: row index of the first row in A( J, J )
|
||||
* J2: row index of the first row in A( J+1, J+1 )
|
||||
* so that J2 - J1 is the row count of the block A( J, J )
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
*
|
||||
* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( KK.EQ.1 ) THEN
|
||||
CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
ELSE
|
||||
CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
END IF
|
||||
* Find largest absolute value entry in the vector segment
|
||||
* X( J1:J2-1, RHS ) as an upper bound for the worst case
|
||||
* growth in the linear updates.
|
||||
XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ),
|
||||
$ LDX, W )
|
||||
*
|
||||
IF( SCALOC .EQ. ZERO ) THEN
|
||||
* LATRS found that A is singular through A(j,j) = 0.
|
||||
* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
|
||||
* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is
|
||||
* set by LATRS.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, J1-1
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
DO II = J2, N
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
|
||||
* LATRS computed a valid scale factor, but combined with
|
||||
* the current scaling the solution does not have a
|
||||
* scale factor > 0.
|
||||
*
|
||||
* Set WORK( J+KK*LDS ) to smallest valid scale
|
||||
* factor and increase SCALOC accordingly.
|
||||
SCAL = WORK( J+KK*LDS ) / SMLNUM
|
||||
SCALOC = SCALOC * SCAL
|
||||
WORK( J+KK*LDS ) = SMLNUM
|
||||
* If LATRS overestimated the growth, x may be
|
||||
* rescaled to preserve a valid combined scale
|
||||
* factor WORK( J, KK ) > 0.
|
||||
RSCAL = ONE / SCALOC
|
||||
IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
|
||||
XNRM( KK ) = XNRM( KK ) * RSCAL
|
||||
CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
|
||||
SCALOC = ONE
|
||||
ELSE
|
||||
* The system op(A) * x = b is badly scaled and its
|
||||
* solution cannot be represented as (1/scale) * x.
|
||||
* Set x to zero. This approach deviates from LATRS
|
||||
* where a completely meaningless non-zero vector
|
||||
* is returned that is not a solution to op(A) * x = b.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, N
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
END IF
|
||||
END IF
|
||||
SCALOC = SCALOC * WORK( J+KK*LDS )
|
||||
WORK( J+KK*LDS ) = SCALOC
|
||||
END DO
|
||||
*
|
||||
* Linear block updates
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
ELSE
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO I = IFIRST, ILAST, IINC
|
||||
* I1: row index of the first column in X( I, K )
|
||||
* I2: row index of the first column in X( I+1, K )
|
||||
* so the I2 - I1 is the row count of the block X( I, K )
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Prepare the linear update to be executed with GEMM.
|
||||
* For each column, compute a consistent scaling, a
|
||||
* scaling factor to survive the linear update, and
|
||||
* rescale the column segments, if necesssary. Then
|
||||
* the linear update is safely executed.
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
* Compute consistent scaling
|
||||
SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
|
||||
*
|
||||
* Compute scaling factor to survive the linear update
|
||||
* simulating consistent scaling.
|
||||
*
|
||||
BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
|
||||
BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
|
||||
XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) )
|
||||
ANRM = WORK( AWRK + I+(J-1)*NBA )
|
||||
SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM )
|
||||
*
|
||||
* Simultaneously apply the robust update factor and the
|
||||
* consistency scaling factor to X( I, KK ) and X( J, KK ).
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
WORK( I+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
|
||||
WORK( J+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
|
||||
*
|
||||
CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( I1, J1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
ELSE IF( LSAME( TRANS, 'T' ) ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
|
||||
*
|
||||
CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
ELSE
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K )
|
||||
*
|
||||
CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Reduce local scaling factors
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
DO I = 1, NBA
|
||||
SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Realize consistent scaling
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
|
||||
DO I = 1, NBA
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
|
||||
IF( SCAL.NE.ONE )
|
||||
$ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
*
|
||||
* End of CLATRS3
|
||||
*
|
||||
END
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,99 @@
|
|||
*> \brief \b DLARMM
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* DOUBLE PRECISION ANORM, BNORM, CNORM
|
||||
* ..
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =======
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> DLARMM returns a factor s in (0, 1] such that the linear updates
|
||||
*>
|
||||
*> (s * C) - A * (s * B) and (s * C) - (s * A) * B
|
||||
*>
|
||||
*> cannot overflow, where A, B, and C are matrices of conforming
|
||||
*> dimensions.
|
||||
*>
|
||||
*> This is an auxiliary routine so there is no argument checking.
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* =========
|
||||
*
|
||||
*> \param[in] ANORM
|
||||
*> \verbatim
|
||||
*> ANORM is DOUBLE PRECISION
|
||||
*> The infinity norm of A. ANORM >= 0.
|
||||
*> The number of rows of the matrix A. M >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] BNORM
|
||||
*> \verbatim
|
||||
*> BNORM is DOUBLE PRECISION
|
||||
*> The infinity norm of B. BNORM >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is DOUBLE PRECISION
|
||||
*> The infinity norm of C. CNORM >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*>
|
||||
* =====================================================================
|
||||
*> References:
|
||||
*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for
|
||||
*> Robust Solution of Triangular Linear Systems. In: International
|
||||
*> Conference on Parallel Processing and Applied Mathematics, pages
|
||||
*> 68--78. Springer, 2017.
|
||||
*>
|
||||
*> \ingroup OTHERauxiliary
|
||||
* =====================================================================
|
||||
|
||||
DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM )
|
||||
IMPLICIT NONE
|
||||
* .. Scalar Arguments ..
|
||||
DOUBLE PRECISION ANORM, BNORM, CNORM
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ONE, HALF, FOUR
|
||||
PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION BIGNUM, SMLNUM
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
DOUBLE PRECISION DLAMCH
|
||||
EXTERNAL DLAMCH
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
*
|
||||
* Determine machine dependent parameters to control overflow.
|
||||
*
|
||||
SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' )
|
||||
BIGNUM = ( ONE / SMLNUM ) / FOUR
|
||||
*
|
||||
* Compute a scale factor.
|
||||
*
|
||||
DLARMM = ONE
|
||||
IF( BNORM .LE. ONE ) THEN
|
||||
IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN
|
||||
DLARMM = HALF
|
||||
END IF
|
||||
ELSE
|
||||
IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN
|
||||
DLARMM = HALF / BNORM
|
||||
END IF
|
||||
END IF
|
||||
RETURN
|
||||
*
|
||||
* ==== End of DLARMM ====
|
||||
*
|
||||
END
|
|
@ -0,0 +1,656 @@
|
|||
*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
* X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* CHARACTER DIAG, NORMIN, TRANS, UPLO
|
||||
* INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ),
|
||||
* WORK( * ), X( LDX, * )
|
||||
* ..
|
||||
*
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =============
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> DLATRS3 solves one of the triangular systems
|
||||
*>
|
||||
*> A * X = B * diag(scale) or A**T * X = B * diag(scale)
|
||||
*>
|
||||
*> with scaling to prevent overflow. Here A is an upper or lower
|
||||
*> triangular matrix, A**T denotes the transpose of A. X and B are
|
||||
*> n by nrhs matrices and scale is an nrhs element vector of scaling
|
||||
*> factors. A scaling factor scale(j) is usually less than or equal
|
||||
*> to 1, chosen such that X(:,j) is less than the overflow threshold.
|
||||
*> If the matrix A is singular (A(j,j) = 0 for some j), then
|
||||
*> a non-trivial solution to A*X = 0 is returned. If the system is
|
||||
*> so badly scaled that the solution cannot be represented as
|
||||
*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
|
||||
*>
|
||||
*> This is a BLAS-3 version of LATRS for solving several right
|
||||
*> hand sides simultaneously.
|
||||
*>
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* ==========
|
||||
*
|
||||
*> \param[in] UPLO
|
||||
*> \verbatim
|
||||
*> UPLO is CHARACTER*1
|
||||
*> Specifies whether the matrix A is upper or lower triangular.
|
||||
*> = 'U': Upper triangular
|
||||
*> = 'L': Lower triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] TRANS
|
||||
*> \verbatim
|
||||
*> TRANS is CHARACTER*1
|
||||
*> Specifies the operation applied to A.
|
||||
*> = 'N': Solve A * x = s*b (No transpose)
|
||||
*> = 'T': Solve A**T* x = s*b (Transpose)
|
||||
*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose)
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] DIAG
|
||||
*> \verbatim
|
||||
*> DIAG is CHARACTER*1
|
||||
*> Specifies whether or not the matrix A is unit triangular.
|
||||
*> = 'N': Non-unit triangular
|
||||
*> = 'U': Unit triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NORMIN
|
||||
*> \verbatim
|
||||
*> NORMIN is CHARACTER*1
|
||||
*> Specifies whether CNORM has been set or not.
|
||||
*> = 'Y': CNORM contains the column norms on entry
|
||||
*> = 'N': CNORM is not set on entry. On exit, the norms will
|
||||
*> be computed and stored in CNORM.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] N
|
||||
*> \verbatim
|
||||
*> N is INTEGER
|
||||
*> The order of the matrix A. N >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NRHS
|
||||
*> \verbatim
|
||||
*> NRHS is INTEGER
|
||||
*> The number of columns of X. NRHS >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] A
|
||||
*> \verbatim
|
||||
*> A is DOUBLE PRECISION array, dimension (LDA,N)
|
||||
*> The triangular matrix A. If UPLO = 'U', the leading n by n
|
||||
*> upper triangular part of the array A contains the upper
|
||||
*> triangular matrix, and the strictly lower triangular part of
|
||||
*> A is not referenced. If UPLO = 'L', the leading n by n lower
|
||||
*> triangular part of the array A contains the lower triangular
|
||||
*> matrix, and the strictly upper triangular part of A is not
|
||||
*> referenced. If DIAG = 'U', the diagonal elements of A are
|
||||
*> also not referenced and are assumed to be 1.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDA
|
||||
*> \verbatim
|
||||
*> LDA is INTEGER
|
||||
*> The leading dimension of the array A. LDA >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] X
|
||||
*> \verbatim
|
||||
*> X is DOUBLE PRECISION array, dimension (LDX,NRHS)
|
||||
*> On entry, the right hand side B of the triangular system.
|
||||
*> On exit, X is overwritten by the solution matrix X.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDX
|
||||
*> \verbatim
|
||||
*> LDX is INTEGER
|
||||
*> The leading dimension of the array X. LDX >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] SCALE
|
||||
*> \verbatim
|
||||
*> SCALE is DOUBLE PRECISION array, dimension (NRHS)
|
||||
*> The scaling factor s(k) is for the triangular system
|
||||
*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k).
|
||||
*> If SCALE = 0, the matrix A is singular or badly scaled.
|
||||
*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
|
||||
*> that is an exact or approximate solution to A*x(:,k) = 0
|
||||
*> is returned. If the system so badly scaled that solution
|
||||
*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
|
||||
*> is returned.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is DOUBLE PRECISION array, dimension (N)
|
||||
*>
|
||||
*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
|
||||
*> contains the norm of the off-diagonal part of the j-th column
|
||||
*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal
|
||||
*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
|
||||
*> must be greater than or equal to the 1-norm.
|
||||
*>
|
||||
*> If NORMIN = 'N', CNORM is an output argument and CNORM(j)
|
||||
*> returns the 1-norm of the offdiagonal part of the j-th column
|
||||
*> of A.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] WORK
|
||||
*> \verbatim
|
||||
*> WORK is DOUBLE PRECISION array, dimension (LWORK).
|
||||
*> On exit, if INFO = 0, WORK(1) returns the optimal size of
|
||||
*> WORK.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LWORK
|
||||
*> LWORK is INTEGER
|
||||
*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
|
||||
*> NBA = (N + NB - 1)/NB and NB is the optimal block size.
|
||||
*>
|
||||
*> If LWORK = -1, then a workspace query is assumed; the routine
|
||||
*> only calculates the optimal dimensions of the WORK array, returns
|
||||
*> this value as the first entry of the WORK array, and no error
|
||||
*> message related to LWORK is issued by XERBLA.
|
||||
*>
|
||||
*> \param[out] INFO
|
||||
*> \verbatim
|
||||
*> INFO is INTEGER
|
||||
*> = 0: successful exit
|
||||
*> < 0: if INFO = -k, the k-th argument had an illegal value
|
||||
*> \endverbatim
|
||||
*
|
||||
* Authors:
|
||||
* ========
|
||||
*
|
||||
*> \author Univ. of Tennessee
|
||||
*> \author Univ. of California Berkeley
|
||||
*> \author Univ. of Colorado Denver
|
||||
*> \author NAG Ltd.
|
||||
*
|
||||
*> \ingroup doubleOTHERauxiliary
|
||||
*> \par Further Details:
|
||||
* =====================
|
||||
* \verbatim
|
||||
* The algorithm follows the structure of a block triangular solve.
|
||||
* The diagonal block is solved with a call to the robust the triangular
|
||||
* solver LATRS for every right-hand side RHS = 1, ..., NRHS
|
||||
* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
|
||||
* where op( A ) = A or op( A ) = A**T.
|
||||
* The linear block updates operate on block columns of X,
|
||||
* B( I, K ) - op(A( I, J )) * X( J, K )
|
||||
* and use GEMM. To avoid overflow in the linear block update, the worst case
|
||||
* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
|
||||
* such that
|
||||
* || s * B( I, RHS )||_oo
|
||||
* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold
|
||||
*
|
||||
* Once all columns of a block column have been rescaled (BLAS-1), the linear
|
||||
* update is executed with GEMM without overflow.
|
||||
*
|
||||
* To limit rescaling, local scale factors track the scaling of column segments.
|
||||
* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
|
||||
* per right-hand side column RHS = 1, ..., NRHS. The global scale factor
|
||||
* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
|
||||
* I = 1, ..., NBA.
|
||||
* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
|
||||
* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
|
||||
* linear update of potentially inconsistently scaled vector segments
|
||||
* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
|
||||
* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
|
||||
* if necessary, rescales the blocks prior to calling GEMM.
|
||||
*
|
||||
* \endverbatim
|
||||
* =====================================================================
|
||||
* References:
|
||||
* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
|
||||
* Parallel robust solution of triangular linear systems. Concurrency
|
||||
* and Computation: Practice and Experience, 31(19), e5064.
|
||||
*
|
||||
* Contributor:
|
||||
* Angelika Schwarz, Umea University, Sweden.
|
||||
*
|
||||
* =====================================================================
|
||||
SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
$ X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
IMPLICIT NONE
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
CHARACTER DIAG, TRANS, NORMIN, UPLO
|
||||
INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ),
|
||||
$ SCALE( * ), WORK( * )
|
||||
* ..
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ZERO, ONE
|
||||
PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
|
||||
INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN
|
||||
PARAMETER ( NRHSMIN = 2, NBRHS = 32 )
|
||||
PARAMETER ( NBMIN = 8, NBMAX = 64 )
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER
|
||||
INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
|
||||
$ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
|
||||
$ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
|
||||
DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
|
||||
$ SCAMIN, SMLNUM, TMAX
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
DOUBLE PRECISION DLAMCH, DLANGE, DLARMM
|
||||
EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL DLATRS, DSCAL, XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC ABS, MAX, MIN
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
NOTRAN = LSAME( TRANS, 'N' )
|
||||
NOUNIT = LSAME( DIAG, 'N' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
*
|
||||
* Partition A and X into blocks
|
||||
*
|
||||
NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) )
|
||||
NB = MIN( NBMAX, NB )
|
||||
NBA = MAX( 1, (N + NB - 1) / NB )
|
||||
NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
|
||||
*
|
||||
* Compute the workspace
|
||||
*
|
||||
* The workspace comprises two parts.
|
||||
* The first part stores the local scale factors. Each simultaneously
|
||||
* computed right-hand side requires one local scale factor per block
|
||||
* row. WORK( I+KK*LDS ) is the scale factor of the vector
|
||||
* segment associated with the I-th block row and the KK-th vector
|
||||
* in the block column.
|
||||
LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
|
||||
LDS = NBA
|
||||
* The second part stores upper bounds of the triangular A. There are
|
||||
* a total of NBA x NBA blocks, of which only the upper triangular
|
||||
* part or the lower triangular part is referenced. The upper bound of
|
||||
* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
|
||||
LANRM = NBA * NBA
|
||||
AWRK = LSCALE
|
||||
WORK( 1 ) = LSCALE + LANRM
|
||||
*
|
||||
* Test the input parameters
|
||||
*
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
|
||||
$ LSAME( TRANS, 'C' ) ) THEN
|
||||
INFO = -2
|
||||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
|
||||
INFO = -3
|
||||
ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
|
||||
$ LSAME( NORMIN, 'N' ) ) THEN
|
||||
INFO = -4
|
||||
ELSE IF( N.LT.0 ) THEN
|
||||
INFO = -5
|
||||
ELSE IF( NRHS.LT.0 ) THEN
|
||||
INFO = -6
|
||||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -8
|
||||
ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -10
|
||||
ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
|
||||
INFO = -14
|
||||
END IF
|
||||
IF( INFO.NE.0 ) THEN
|
||||
CALL XERBLA( 'DLATRS3', -INFO )
|
||||
RETURN
|
||||
ELSE IF( LQUERY ) THEN
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Initialize scaling factors
|
||||
*
|
||||
DO KK = 1, NRHS
|
||||
SCALE( KK ) = ONE
|
||||
END DO
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( MIN( N, NRHS ).EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Determine machine dependent constant to control overflow.
|
||||
*
|
||||
BIGNUM = DLAMCH( 'Overflow' )
|
||||
SMLNUM = DLAMCH( 'Safe Minimum' )
|
||||
*
|
||||
* Use unblocked code for small problems
|
||||
*
|
||||
IF( NRHS.LT.NRHSMIN ) THEN
|
||||
CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
|
||||
$ SCALE( 1 ), CNORM, INFO )
|
||||
DO K = 2, NRHS
|
||||
CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Compute norms of blocks of A excluding diagonal blocks and find
|
||||
* the block with the largest norm TMAX.
|
||||
*
|
||||
TMAX = ZERO
|
||||
DO J = 1, NBA
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
IF ( UPPER ) THEN
|
||||
IFIRST = 1
|
||||
ILAST = J - 1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
END IF
|
||||
DO I = IFIRST, ILAST
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Compute upper bound of A( I1:I2-1, J1:J2-1 ).
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + I+(J-1)*NBA ) = ANRM
|
||||
ELSE
|
||||
ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + J+(I-1)*NBA ) = ANRM
|
||||
END IF
|
||||
TMAX = MAX( TMAX, ANRM )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN
|
||||
*
|
||||
* Some matrix entries have huge absolute value. At least one upper
|
||||
* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
|
||||
* number, either due to overflow in LANGE or due to Inf in A.
|
||||
* Fall back to LATRS. Set normin = 'N' for every right-hand side to
|
||||
* force computation of TSCAL in LATRS to avoid the likely overflow
|
||||
* in the computation of the column norms CNORM.
|
||||
*
|
||||
DO K = 1, NRHS
|
||||
CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Every right-hand side requires workspace to store NBA local scale
|
||||
* factors. To save workspace, X is computed successively in block columns
|
||||
* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
|
||||
* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
|
||||
DO K = 1, NBX
|
||||
* Loop over block columns (index = K) of X and, for column-wise scalings,
|
||||
* over individual columns (index = KK).
|
||||
* K1: column index of the first column in X( J, K )
|
||||
* K2: column index of the first column in X( J, K+1 )
|
||||
* so the K2 - K1 is the column count of the block X( J, K )
|
||||
K1 = (K-1)*NBRHS + 1
|
||||
K2 = MIN( K*NBRHS, NRHS ) + 1
|
||||
*
|
||||
* Initialize local scaling factors of current block column X( J, K )
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
DO I = 1, NBA
|
||||
WORK( I+KK*LDS ) = ONE
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
ELSE
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
ELSE
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO J = JFIRST, JLAST, JINC
|
||||
* J1: row index of the first row in A( J, J )
|
||||
* J2: row index of the first row in A( J+1, J+1 )
|
||||
* so that J2 - J1 is the row count of the block A( J, J )
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
*
|
||||
* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
|
||||
* for all right-hand sides in the current block column,
|
||||
* one RHS at a time.
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( KK.EQ.1 ) THEN
|
||||
CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
ELSE
|
||||
CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
END IF
|
||||
* Find largest absolute value entry in the vector segment
|
||||
* X( J1:J2-1, RHS ) as an upper bound for the worst case
|
||||
* growth in the linear updates.
|
||||
XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ),
|
||||
$ LDX, W )
|
||||
*
|
||||
IF( SCALOC .EQ. ZERO ) THEN
|
||||
* LATRS found that A is singular through A(j,j) = 0.
|
||||
* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
|
||||
* and compute A*x = 0 (or A**T*x = 0). Note that
|
||||
* X(J1:J2-1, KK) is set by LATRS.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, J1-1
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
DO II = J2, N
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN
|
||||
* LATRS computed a valid scale factor, but combined with
|
||||
* the current scaling the solution does not have a
|
||||
* scale factor > 0.
|
||||
*
|
||||
* Set WORK( J+KK*LDS ) to smallest valid scale
|
||||
* factor and increase SCALOC accordingly.
|
||||
SCAL = WORK( J+KK*LDS ) / SMLNUM
|
||||
SCALOC = SCALOC * SCAL
|
||||
WORK( J+KK*LDS ) = SMLNUM
|
||||
* If LATRS overestimated the growth, x may be
|
||||
* rescaled to preserve a valid combined scale
|
||||
* factor WORK( J, KK ) > 0.
|
||||
RSCAL = ONE / SCALOC
|
||||
IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN
|
||||
XNRM( KK ) = XNRM( KK ) * RSCAL
|
||||
CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
|
||||
SCALOC = ONE
|
||||
ELSE
|
||||
* The system op(A) * x = b is badly scaled and its
|
||||
* solution cannot be represented as (1/scale) * x.
|
||||
* Set x to zero. This approach deviates from LATRS
|
||||
* where a completely meaningless non-zero vector
|
||||
* is returned that is not a solution to op(A) * x = b.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, N
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
END IF
|
||||
END IF
|
||||
SCALOC = SCALOC * WORK( J+KK*LDS )
|
||||
WORK( J+KK*LDS ) = SCALOC
|
||||
END DO
|
||||
*
|
||||
* Linear block updates
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
ELSE
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO I = IFIRST, ILAST, IINC
|
||||
* I1: row index of the first column in X( I, K )
|
||||
* I2: row index of the first column in X( I+1, K )
|
||||
* so the I2 - I1 is the row count of the block X( I, K )
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Prepare the linear update to be executed with GEMM.
|
||||
* For each column, compute a consistent scaling, a
|
||||
* scaling factor to survive the linear update, and
|
||||
* rescale the column segments, if necesssary. Then
|
||||
* the linear update is safely executed.
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
* Compute consistent scaling
|
||||
SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) )
|
||||
*
|
||||
* Compute scaling factor to survive the linear update
|
||||
* simulating consistent scaling.
|
||||
*
|
||||
BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
|
||||
BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
|
||||
XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS ))
|
||||
ANRM = WORK( AWRK + I+(J-1)*NBA )
|
||||
SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM )
|
||||
*
|
||||
* Simultaneously apply the robust update factor and the
|
||||
* consistency scaling factor to B( I, KK ) and B( J, KK ).
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
WORK( I+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
|
||||
WORK( J+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
|
||||
*
|
||||
CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE,
|
||||
$ A( I1, J1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ ONE, X( I1, K1 ), LDX )
|
||||
ELSE
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K )
|
||||
*
|
||||
CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ ONE, X( I1, K1 ), LDX )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Reduce local scaling factors
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
DO I = 1, NBA
|
||||
SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Realize consistent scaling
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
|
||||
DO I = 1, NBA
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
|
||||
IF( SCAL.NE.ONE )
|
||||
$ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
*
|
||||
* End of DLATRS3
|
||||
*
|
||||
END
|
File diff suppressed because it is too large
Load Diff
|
@ -469,6 +469,15 @@
|
|||
ELSE
|
||||
NB = 64
|
||||
END IF
|
||||
ELSE IF( C3.EQ.'SYL' ) THEN
|
||||
* The upper bound is to prevent overly aggressive scaling.
|
||||
IF( SNAME ) THEN
|
||||
NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ),
|
||||
$ 240 )
|
||||
ELSE
|
||||
NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ),
|
||||
$ 80 )
|
||||
END IF
|
||||
END IF
|
||||
ELSE IF( C2.EQ.'LA' ) THEN
|
||||
IF( C3.EQ.'UUM' ) THEN
|
||||
|
@ -477,6 +486,12 @@
|
|||
ELSE
|
||||
NB = 64
|
||||
END IF
|
||||
ELSE IF( C3.EQ.'TRS' ) THEN
|
||||
IF( SNAME ) THEN
|
||||
NB = 32
|
||||
ELSE
|
||||
NB = 32
|
||||
END IF
|
||||
END IF
|
||||
ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN
|
||||
IF( C3.EQ.'EBZ' ) THEN
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
*> \brief \b SLARMM
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* REAL ANORM, BNORM, CNORM
|
||||
* ..
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =======
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> SLARMM returns a factor s in (0, 1] such that the linear updates
|
||||
*>
|
||||
*> (s * C) - A * (s * B) and (s * C) - (s * A) * B
|
||||
*>
|
||||
*> cannot overflow, where A, B, and C are matrices of conforming
|
||||
*> dimensions.
|
||||
*>
|
||||
*> This is an auxiliary routine so there is no argument checking.
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* =========
|
||||
*
|
||||
*> \param[in] ANORM
|
||||
*> \verbatim
|
||||
*> ANORM is REAL
|
||||
*> The infinity norm of A. ANORM >= 0.
|
||||
*> The number of rows of the matrix A. M >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] BNORM
|
||||
*> \verbatim
|
||||
*> BNORM is REAL
|
||||
*> The infinity norm of B. BNORM >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is REAL
|
||||
*> The infinity norm of C. CNORM >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*>
|
||||
* =====================================================================
|
||||
*> References:
|
||||
*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for
|
||||
*> Robust Solution of Triangular Linear Systems. In: International
|
||||
*> Conference on Parallel Processing and Applied Mathematics, pages
|
||||
*> 68--78. Springer, 2017.
|
||||
*>
|
||||
*> \ingroup OTHERauxiliary
|
||||
* =====================================================================
|
||||
|
||||
REAL FUNCTION SLARMM( ANORM, BNORM, CNORM )
|
||||
IMPLICIT NONE
|
||||
* .. Scalar Arguments ..
|
||||
REAL ANORM, BNORM, CNORM
|
||||
* .. Parameters ..
|
||||
REAL ONE, HALF, FOUR
|
||||
PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
REAL BIGNUM, SMLNUM
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
REAL SLAMCH
|
||||
EXTERNAL SLAMCH
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
*
|
||||
* Determine machine dependent parameters to control overflow.
|
||||
*
|
||||
SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' )
|
||||
BIGNUM = ( ONE / SMLNUM ) / FOUR
|
||||
*
|
||||
* Compute a scale factor.
|
||||
*
|
||||
SLARMM = ONE
|
||||
IF( BNORM .LE. ONE ) THEN
|
||||
IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN
|
||||
SLARMM = HALF
|
||||
END IF
|
||||
ELSE
|
||||
IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN
|
||||
SLARMM = HALF / BNORM
|
||||
END IF
|
||||
END IF
|
||||
RETURN
|
||||
*
|
||||
* ==== End of SLARMM ====
|
||||
*
|
||||
END
|
|
@ -0,0 +1,656 @@
|
|||
*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
* X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* CHARACTER DIAG, NORMIN, TRANS, UPLO
|
||||
* INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
* REAL A( LDA, * ), CNORM( * ), SCALE( * ),
|
||||
* WORK( * ), X( LDX, * )
|
||||
* ..
|
||||
*
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =============
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> SLATRS3 solves one of the triangular systems
|
||||
*>
|
||||
*> A * X = B * diag(scale) or A**T * X = B * diag(scale)
|
||||
*>
|
||||
*> with scaling to prevent overflow. Here A is an upper or lower
|
||||
*> triangular matrix, A**T denotes the transpose of A. X and B are
|
||||
*> n by nrhs matrices and scale is an nrhs element vector of scaling
|
||||
*> factors. A scaling factor scale(j) is usually less than or equal
|
||||
*> to 1, chosen such that X(:,j) is less than the overflow threshold.
|
||||
*> If the matrix A is singular (A(j,j) = 0 for some j), then
|
||||
*> a non-trivial solution to A*X = 0 is returned. If the system is
|
||||
*> so badly scaled that the solution cannot be represented as
|
||||
*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
|
||||
*>
|
||||
*> This is a BLAS-3 version of LATRS for solving several right
|
||||
*> hand sides simultaneously.
|
||||
*>
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* ==========
|
||||
*
|
||||
*> \param[in] UPLO
|
||||
*> \verbatim
|
||||
*> UPLO is CHARACTER*1
|
||||
*> Specifies whether the matrix A is upper or lower triangular.
|
||||
*> = 'U': Upper triangular
|
||||
*> = 'L': Lower triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] TRANS
|
||||
*> \verbatim
|
||||
*> TRANS is CHARACTER*1
|
||||
*> Specifies the operation applied to A.
|
||||
*> = 'N': Solve A * x = s*b (No transpose)
|
||||
*> = 'T': Solve A**T* x = s*b (Transpose)
|
||||
*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose)
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] DIAG
|
||||
*> \verbatim
|
||||
*> DIAG is CHARACTER*1
|
||||
*> Specifies whether or not the matrix A is unit triangular.
|
||||
*> = 'N': Non-unit triangular
|
||||
*> = 'U': Unit triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NORMIN
|
||||
*> \verbatim
|
||||
*> NORMIN is CHARACTER*1
|
||||
*> Specifies whether CNORM has been set or not.
|
||||
*> = 'Y': CNORM contains the column norms on entry
|
||||
*> = 'N': CNORM is not set on entry. On exit, the norms will
|
||||
*> be computed and stored in CNORM.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] N
|
||||
*> \verbatim
|
||||
*> N is INTEGER
|
||||
*> The order of the matrix A. N >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NRHS
|
||||
*> \verbatim
|
||||
*> NRHS is INTEGER
|
||||
*> The number of columns of X. NRHS >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] A
|
||||
*> \verbatim
|
||||
*> A is REAL array, dimension (LDA,N)
|
||||
*> The triangular matrix A. If UPLO = 'U', the leading n by n
|
||||
*> upper triangular part of the array A contains the upper
|
||||
*> triangular matrix, and the strictly lower triangular part of
|
||||
*> A is not referenced. If UPLO = 'L', the leading n by n lower
|
||||
*> triangular part of the array A contains the lower triangular
|
||||
*> matrix, and the strictly upper triangular part of A is not
|
||||
*> referenced. If DIAG = 'U', the diagonal elements of A are
|
||||
*> also not referenced and are assumed to be 1.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDA
|
||||
*> \verbatim
|
||||
*> LDA is INTEGER
|
||||
*> The leading dimension of the array A. LDA >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] X
|
||||
*> \verbatim
|
||||
*> X is REAL array, dimension (LDX,NRHS)
|
||||
*> On entry, the right hand side B of the triangular system.
|
||||
*> On exit, X is overwritten by the solution matrix X.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDX
|
||||
*> \verbatim
|
||||
*> LDX is INTEGER
|
||||
*> The leading dimension of the array X. LDX >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] SCALE
|
||||
*> \verbatim
|
||||
*> SCALE is REAL array, dimension (NRHS)
|
||||
*> The scaling factor s(k) is for the triangular system
|
||||
*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k).
|
||||
*> If SCALE = 0, the matrix A is singular or badly scaled.
|
||||
*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
|
||||
*> that is an exact or approximate solution to A*x(:,k) = 0
|
||||
*> is returned. If the system so badly scaled that solution
|
||||
*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
|
||||
*> is returned.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is REAL array, dimension (N)
|
||||
*>
|
||||
*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
|
||||
*> contains the norm of the off-diagonal part of the j-th column
|
||||
*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal
|
||||
*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
|
||||
*> must be greater than or equal to the 1-norm.
|
||||
*>
|
||||
*> If NORMIN = 'N', CNORM is an output argument and CNORM(j)
|
||||
*> returns the 1-norm of the offdiagonal part of the j-th column
|
||||
*> of A.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] WORK
|
||||
*> \verbatim
|
||||
*> WORK is REAL array, dimension (LWORK).
|
||||
*> On exit, if INFO = 0, WORK(1) returns the optimal size of
|
||||
*> WORK.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LWORK
|
||||
*> LWORK is INTEGER
|
||||
*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
|
||||
*> NBA = (N + NB - 1)/NB and NB is the optimal block size.
|
||||
*>
|
||||
*> If LWORK = -1, then a workspace query is assumed; the routine
|
||||
*> only calculates the optimal dimensions of the WORK array, returns
|
||||
*> this value as the first entry of the WORK array, and no error
|
||||
*> message related to LWORK is issued by XERBLA.
|
||||
*>
|
||||
*> \param[out] INFO
|
||||
*> \verbatim
|
||||
*> INFO is INTEGER
|
||||
*> = 0: successful exit
|
||||
*> < 0: if INFO = -k, the k-th argument had an illegal value
|
||||
*> \endverbatim
|
||||
*
|
||||
* Authors:
|
||||
* ========
|
||||
*
|
||||
*> \author Univ. of Tennessee
|
||||
*> \author Univ. of California Berkeley
|
||||
*> \author Univ. of Colorado Denver
|
||||
*> \author NAG Ltd.
|
||||
*
|
||||
*> \ingroup doubleOTHERauxiliary
|
||||
*> \par Further Details:
|
||||
* =====================
|
||||
* \verbatim
|
||||
* The algorithm follows the structure of a block triangular solve.
|
||||
* The diagonal block is solved with a call to the robust the triangular
|
||||
* solver LATRS for every right-hand side RHS = 1, ..., NRHS
|
||||
* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
|
||||
* where op( A ) = A or op( A ) = A**T.
|
||||
* The linear block updates operate on block columns of X,
|
||||
* B( I, K ) - op(A( I, J )) * X( J, K )
|
||||
* and use GEMM. To avoid overflow in the linear block update, the worst case
|
||||
* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
|
||||
* such that
|
||||
* || s * B( I, RHS )||_oo
|
||||
* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold
|
||||
*
|
||||
* Once all columns of a block column have been rescaled (BLAS-1), the linear
|
||||
* update is executed with GEMM without overflow.
|
||||
*
|
||||
* To limit rescaling, local scale factors track the scaling of column segments.
|
||||
* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
|
||||
* per right-hand side column RHS = 1, ..., NRHS. The global scale factor
|
||||
* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
|
||||
* I = 1, ..., NBA.
|
||||
* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
|
||||
* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
|
||||
* linear update of potentially inconsistently scaled vector segments
|
||||
* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
|
||||
* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
|
||||
* if necessary, rescales the blocks prior to calling GEMM.
|
||||
*
|
||||
* \endverbatim
|
||||
* =====================================================================
|
||||
* References:
|
||||
* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
|
||||
* Parallel robust solution of triangular linear systems. Concurrency
|
||||
* and Computation: Practice and Experience, 31(19), e5064.
|
||||
*
|
||||
* Contributor:
|
||||
* Angelika Schwarz, Umea University, Sweden.
|
||||
*
|
||||
* =====================================================================
|
||||
SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
$ X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
IMPLICIT NONE
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
CHARACTER DIAG, TRANS, NORMIN, UPLO
|
||||
INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
REAL A( LDA, * ), CNORM( * ), X( LDX, * ),
|
||||
$ SCALE( * ), WORK( * )
|
||||
* ..
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
REAL ZERO, ONE
|
||||
PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
|
||||
INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN
|
||||
PARAMETER ( NRHSMIN = 2, NBRHS = 32 )
|
||||
PARAMETER ( NBMIN = 8, NBMAX = 64 )
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
REAL W( NBMAX ), XNRM( NBRHS )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER
|
||||
INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
|
||||
$ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
|
||||
$ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
|
||||
REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
|
||||
$ SCAMIN, SMLNUM, TMAX
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
REAL SLAMCH, SLANGE, SLARMM
|
||||
EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL SLATRS, SSCAL, XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC ABS, MAX, MIN
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
NOTRAN = LSAME( TRANS, 'N' )
|
||||
NOUNIT = LSAME( DIAG, 'N' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
*
|
||||
* Partition A and X into blocks.
|
||||
*
|
||||
NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) )
|
||||
NB = MIN( NBMAX, NB )
|
||||
NBA = MAX( 1, (N + NB - 1) / NB )
|
||||
NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
|
||||
*
|
||||
* Compute the workspace
|
||||
*
|
||||
* The workspace comprises two parts.
|
||||
* The first part stores the local scale factors. Each simultaneously
|
||||
* computed right-hand side requires one local scale factor per block
|
||||
* row. WORK( I + KK * LDS ) is the scale factor of the vector
|
||||
* segment associated with the I-th block row and the KK-th vector
|
||||
* in the block column.
|
||||
LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
|
||||
LDS = NBA
|
||||
* The second part stores upper bounds of the triangular A. There are
|
||||
* a total of NBA x NBA blocks, of which only the upper triangular
|
||||
* part or the lower triangular part is referenced. The upper bound of
|
||||
* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
|
||||
LANRM = NBA * NBA
|
||||
AWRK = LSCALE
|
||||
WORK( 1 ) = LSCALE + LANRM
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
|
||||
$ LSAME( TRANS, 'C' ) ) THEN
|
||||
INFO = -2
|
||||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
|
||||
INFO = -3
|
||||
ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
|
||||
$ LSAME( NORMIN, 'N' ) ) THEN
|
||||
INFO = -4
|
||||
ELSE IF( N.LT.0 ) THEN
|
||||
INFO = -5
|
||||
ELSE IF( NRHS.LT.0 ) THEN
|
||||
INFO = -6
|
||||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -8
|
||||
ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -10
|
||||
ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
|
||||
INFO = -14
|
||||
END IF
|
||||
IF( INFO.NE.0 ) THEN
|
||||
CALL XERBLA( 'SLATRS3', -INFO )
|
||||
RETURN
|
||||
ELSE IF( LQUERY ) THEN
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Initialize scaling factors
|
||||
*
|
||||
DO KK = 1, NRHS
|
||||
SCALE( KK ) = ONE
|
||||
END DO
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( MIN( N, NRHS ).EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Determine machine dependent constant to control overflow.
|
||||
*
|
||||
BIGNUM = SLAMCH( 'Overflow' )
|
||||
SMLNUM = SLAMCH( 'Safe Minimum' )
|
||||
*
|
||||
* Use unblocked code for small problems
|
||||
*
|
||||
IF( NRHS.LT.NRHSMIN ) THEN
|
||||
CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
|
||||
$ SCALE( 1 ), CNORM, INFO )
|
||||
DO K = 2, NRHS
|
||||
CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Compute norms of blocks of A excluding diagonal blocks and find
|
||||
* the block with the largest norm TMAX.
|
||||
*
|
||||
TMAX = ZERO
|
||||
DO J = 1, NBA
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
IF ( UPPER ) THEN
|
||||
IFIRST = 1
|
||||
ILAST = J - 1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
END IF
|
||||
DO I = IFIRST, ILAST
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Compute upper bound of A( I1:I2-1, J1:J2-1 ).
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + I+(J-1)*NBA ) = ANRM
|
||||
ELSE
|
||||
ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + J+(I-1)*NBA ) = ANRM
|
||||
END IF
|
||||
TMAX = MAX( TMAX, ANRM )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN
|
||||
*
|
||||
* Some matrix entries have huge absolute value. At least one upper
|
||||
* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
|
||||
* number, either due to overflow in LANGE or due to Inf in A.
|
||||
* Fall back to LATRS. Set normin = 'N' for every right-hand side to
|
||||
* force computation of TSCAL in LATRS to avoid the likely overflow
|
||||
* in the computation of the column norms CNORM.
|
||||
*
|
||||
DO K = 1, NRHS
|
||||
CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Every right-hand side requires workspace to store NBA local scale
|
||||
* factors. To save workspace, X is computed successively in block columns
|
||||
* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
|
||||
* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
|
||||
DO K = 1, NBX
|
||||
* Loop over block columns (index = K) of X and, for column-wise scalings,
|
||||
* over individual columns (index = KK).
|
||||
* K1: column index of the first column in X( J, K )
|
||||
* K2: column index of the first column in X( J, K+1 )
|
||||
* so the K2 - K1 is the column count of the block X( J, K )
|
||||
K1 = (K-1)*NBRHS + 1
|
||||
K2 = MIN( K*NBRHS, NRHS ) + 1
|
||||
*
|
||||
* Initialize local scaling factors of current block column X( J, K )
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
DO I = 1, NBA
|
||||
WORK( I+KK*LDS ) = ONE
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
ELSE
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
ELSE
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO J = JFIRST, JLAST, JINC
|
||||
* J1: row index of the first row in A( J, J )
|
||||
* J2: row index of the first row in A( J+1, J+1 )
|
||||
* so that J2 - J1 is the row count of the block A( J, J )
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
*
|
||||
* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
|
||||
* for all right-hand sides in the current block column,
|
||||
* one RHS at a time.
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( KK.EQ.1 ) THEN
|
||||
CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
ELSE
|
||||
CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
END IF
|
||||
* Find largest absolute value entry in the vector segment
|
||||
* X( J1:J2-1, RHS ) as an upper bound for the worst case
|
||||
* growth in the linear updates.
|
||||
XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ),
|
||||
$ LDX, W )
|
||||
*
|
||||
IF( SCALOC .EQ. ZERO ) THEN
|
||||
* LATRS found that A is singular through A(j,j) = 0.
|
||||
* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
|
||||
* and compute A*x = 0 (or A**T*x = 0). Note that
|
||||
* X(J1:J2-1, KK) is set by LATRS.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, J1-1
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
DO II = J2, N
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
|
||||
* LATRS computed a valid scale factor, but combined with
|
||||
* the current scaling the solution does not have a
|
||||
* scale factor > 0.
|
||||
*
|
||||
* Set WORK( J+KK*LDS ) to smallest valid scale
|
||||
* factor and increase SCALOC accordingly.
|
||||
SCAL = WORK( J+KK*LDS ) / SMLNUM
|
||||
SCALOC = SCALOC * SCAL
|
||||
WORK( J+KK*LDS ) = SMLNUM
|
||||
* If LATRS overestimated the growth, x may be
|
||||
* rescaled to preserve a valid combined scale
|
||||
* factor WORK( J, KK ) > 0.
|
||||
RSCAL = ONE / SCALOC
|
||||
IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
|
||||
XNRM( KK ) = XNRM( KK ) * RSCAL
|
||||
CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
|
||||
SCALOC = ONE
|
||||
ELSE
|
||||
* The system op(A) * x = b is badly scaled and its
|
||||
* solution cannot be represented as (1/scale) * x.
|
||||
* Set x to zero. This approach deviates from LATRS
|
||||
* where a completely meaningless non-zero vector
|
||||
* is returned that is not a solution to op(A) * x = b.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, N
|
||||
X( II, KK ) = ZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
END IF
|
||||
END IF
|
||||
SCALOC = SCALOC * WORK( J+KK*LDS )
|
||||
WORK( J+KK*LDS ) = SCALOC
|
||||
END DO
|
||||
*
|
||||
* Linear block updates
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
ELSE
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO I = IFIRST, ILAST, IINC
|
||||
* I1: row index of the first column in X( I, K )
|
||||
* I2: row index of the first column in X( I+1, K )
|
||||
* so the I2 - I1 is the row count of the block X( I, K )
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Prepare the linear update to be executed with GEMM.
|
||||
* For each column, compute a consistent scaling, a
|
||||
* scaling factor to survive the linear update, and
|
||||
* rescale the column segments, if necesssary. Then
|
||||
* the linear update is safely executed.
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
* Compute consistent scaling
|
||||
SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
|
||||
*
|
||||
* Compute scaling factor to survive the linear update
|
||||
* simulating consistent scaling.
|
||||
*
|
||||
BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
|
||||
BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
|
||||
XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS ))
|
||||
ANRM = WORK( AWRK + I+(J-1)*NBA )
|
||||
SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM )
|
||||
*
|
||||
* Simultaneously apply the robust update factor and the
|
||||
* consistency scaling factor to B( I, KK ) and B( J, KK ).
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
WORK( I+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
|
||||
WORK( J+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
|
||||
*
|
||||
CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE,
|
||||
$ A( I1, J1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ ONE, X( I1, K1 ), LDX )
|
||||
ELSE
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
|
||||
*
|
||||
CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ ONE, X( I1, K1 ), LDX )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Reduce local scaling factors
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
DO I = 1, NBA
|
||||
SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Realize consistent scaling
|
||||
*
|
||||
DO KK = 1, K2-K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
|
||||
DO I = 1, NBA
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
|
||||
IF( SCAL.NE.ONE )
|
||||
$ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
*
|
||||
* End of SLATRS3
|
||||
*
|
||||
END
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,667 @@
|
|||
*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
|
||||
*
|
||||
* Definition:
|
||||
* ===========
|
||||
*
|
||||
* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
* X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
* CHARACTER DIAG, NORMIN, TRANS, UPLO
|
||||
* INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * )
|
||||
* COMPLEX*16 A( LDA, * ), X( LDX, * )
|
||||
* ..
|
||||
*
|
||||
*
|
||||
*> \par Purpose:
|
||||
* =============
|
||||
*>
|
||||
*> \verbatim
|
||||
*>
|
||||
*> ZLATRS3 solves one of the triangular systems
|
||||
*>
|
||||
*> A * X = B * diag(scale), A**T * X = B * diag(scale), or
|
||||
*> A**H * X = B * diag(scale)
|
||||
*>
|
||||
*> with scaling to prevent overflow. Here A is an upper or lower
|
||||
*> triangular matrix, A**T denotes the transpose of A, A**H denotes the
|
||||
*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale
|
||||
*> is an nrhs-element vector of scaling factors. A scaling factor scale(j)
|
||||
*> is usually less than or equal to 1, chosen such that X(:,j) is less
|
||||
*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0
|
||||
*> for some j), then a non-trivial solution to A*X = 0 is returned. If
|
||||
*> the system is so badly scaled that the solution cannot be represented
|
||||
*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
|
||||
*>
|
||||
*> This is a BLAS-3 version of LATRS for solving several right
|
||||
*> hand sides simultaneously.
|
||||
*>
|
||||
*> \endverbatim
|
||||
*
|
||||
* Arguments:
|
||||
* ==========
|
||||
*
|
||||
*> \param[in] UPLO
|
||||
*> \verbatim
|
||||
*> UPLO is CHARACTER*1
|
||||
*> Specifies whether the matrix A is upper or lower triangular.
|
||||
*> = 'U': Upper triangular
|
||||
*> = 'L': Lower triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] TRANS
|
||||
*> \verbatim
|
||||
*> TRANS is CHARACTER*1
|
||||
*> Specifies the operation applied to A.
|
||||
*> = 'N': Solve A * x = s*b (No transpose)
|
||||
*> = 'T': Solve A**T* x = s*b (Transpose)
|
||||
*> = 'C': Solve A**T* x = s*b (Conjugate transpose)
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] DIAG
|
||||
*> \verbatim
|
||||
*> DIAG is CHARACTER*1
|
||||
*> Specifies whether or not the matrix A is unit triangular.
|
||||
*> = 'N': Non-unit triangular
|
||||
*> = 'U': Unit triangular
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NORMIN
|
||||
*> \verbatim
|
||||
*> NORMIN is CHARACTER*1
|
||||
*> Specifies whether CNORM has been set or not.
|
||||
*> = 'Y': CNORM contains the column norms on entry
|
||||
*> = 'N': CNORM is not set on entry. On exit, the norms will
|
||||
*> be computed and stored in CNORM.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] N
|
||||
*> \verbatim
|
||||
*> N is INTEGER
|
||||
*> The order of the matrix A. N >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] NRHS
|
||||
*> \verbatim
|
||||
*> NRHS is INTEGER
|
||||
*> The number of columns of X. NRHS >= 0.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] A
|
||||
*> \verbatim
|
||||
*> A is COMPLEX*16 array, dimension (LDA,N)
|
||||
*> The triangular matrix A. If UPLO = 'U', the leading n by n
|
||||
*> upper triangular part of the array A contains the upper
|
||||
*> triangular matrix, and the strictly lower triangular part of
|
||||
*> A is not referenced. If UPLO = 'L', the leading n by n lower
|
||||
*> triangular part of the array A contains the lower triangular
|
||||
*> matrix, and the strictly upper triangular part of A is not
|
||||
*> referenced. If DIAG = 'U', the diagonal elements of A are
|
||||
*> also not referenced and are assumed to be 1.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDA
|
||||
*> \verbatim
|
||||
*> LDA is INTEGER
|
||||
*> The leading dimension of the array A. LDA >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] X
|
||||
*> \verbatim
|
||||
*> X is COMPLEX*16 array, dimension (LDX,NRHS)
|
||||
*> On entry, the right hand side B of the triangular system.
|
||||
*> On exit, X is overwritten by the solution matrix X.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LDX
|
||||
*> \verbatim
|
||||
*> LDX is INTEGER
|
||||
*> The leading dimension of the array X. LDX >= max (1,N).
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] SCALE
|
||||
*> \verbatim
|
||||
*> SCALE is DOUBLE PRECISION array, dimension (NRHS)
|
||||
*> The scaling factor s(k) is for the triangular system
|
||||
*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k).
|
||||
*> If SCALE = 0, the matrix A is singular or badly scaled.
|
||||
*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
|
||||
*> that is an exact or approximate solution to A*x(:,k) = 0
|
||||
*> is returned. If the system so badly scaled that solution
|
||||
*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
|
||||
*> is returned.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in,out] CNORM
|
||||
*> \verbatim
|
||||
*> CNORM is DOUBLE PRECISION array, dimension (N)
|
||||
*>
|
||||
*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
|
||||
*> contains the norm of the off-diagonal part of the j-th column
|
||||
*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal
|
||||
*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
|
||||
*> must be greater than or equal to the 1-norm.
|
||||
*>
|
||||
*> If NORMIN = 'N', CNORM is an output argument and CNORM(j)
|
||||
*> returns the 1-norm of the offdiagonal part of the j-th column
|
||||
*> of A.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[out] WORK
|
||||
*> \verbatim
|
||||
*> WORK is DOUBLE PRECISION array, dimension (LWORK).
|
||||
*> On exit, if INFO = 0, WORK(1) returns the optimal size of
|
||||
*> WORK.
|
||||
*> \endverbatim
|
||||
*>
|
||||
*> \param[in] LWORK
|
||||
*> LWORK is INTEGER
|
||||
*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
|
||||
*> NBA = (N + NB - 1)/NB and NB is the optimal block size.
|
||||
*>
|
||||
*> If LWORK = -1, then a workspace query is assumed; the routine
|
||||
*> only calculates the optimal dimensions of the WORK array, returns
|
||||
*> this value as the first entry of the WORK array, and no error
|
||||
*> message related to LWORK is issued by XERBLA.
|
||||
*>
|
||||
*> \param[out] INFO
|
||||
*> \verbatim
|
||||
*> INFO is INTEGER
|
||||
*> = 0: successful exit
|
||||
*> < 0: if INFO = -k, the k-th argument had an illegal value
|
||||
*> \endverbatim
|
||||
*
|
||||
* Authors:
|
||||
* ========
|
||||
*
|
||||
*> \author Univ. of Tennessee
|
||||
*> \author Univ. of California Berkeley
|
||||
*> \author Univ. of Colorado Denver
|
||||
*> \author NAG Ltd.
|
||||
*
|
||||
*> \ingroup doubleOTHERauxiliary
|
||||
*> \par Further Details:
|
||||
* =====================
|
||||
* \verbatim
|
||||
* The algorithm follows the structure of a block triangular solve.
|
||||
* The diagonal block is solved with a call to the robust the triangular
|
||||
* solver LATRS for every right-hand side RHS = 1, ..., NRHS
|
||||
* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
|
||||
* where op( A ) = A or op( A ) = A**T or op( A ) = A**H.
|
||||
* The linear block updates operate on block columns of X,
|
||||
* B( I, K ) - op(A( I, J )) * X( J, K )
|
||||
* and use GEMM. To avoid overflow in the linear block update, the worst case
|
||||
* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
|
||||
* such that
|
||||
* || s * B( I, RHS )||_oo
|
||||
* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold
|
||||
*
|
||||
* Once all columns of a block column have been rescaled (BLAS-1), the linear
|
||||
* update is executed with GEMM without overflow.
|
||||
*
|
||||
* To limit rescaling, local scale factors track the scaling of column segments.
|
||||
* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
|
||||
* per right-hand side column RHS = 1, ..., NRHS. The global scale factor
|
||||
* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
|
||||
* I = 1, ..., NBA.
|
||||
* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
|
||||
* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
|
||||
* linear update of potentially inconsistently scaled vector segments
|
||||
* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
|
||||
* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
|
||||
* if necessary, rescales the blocks prior to calling GEMM.
|
||||
*
|
||||
* \endverbatim
|
||||
* =====================================================================
|
||||
* References:
|
||||
* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
|
||||
* Parallel robust solution of triangular linear systems. Concurrency
|
||||
* and Computation: Practice and Experience, 31(19), e5064.
|
||||
*
|
||||
* Contributor:
|
||||
* Angelika Schwarz, Umea University, Sweden.
|
||||
*
|
||||
* =====================================================================
|
||||
SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
|
||||
$ X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
|
||||
IMPLICIT NONE
|
||||
*
|
||||
* .. Scalar Arguments ..
|
||||
CHARACTER DIAG, TRANS, NORMIN, UPLO
|
||||
INTEGER INFO, LDA, LWORK, LDX, N, NRHS
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
COMPLEX*16 A( LDA, * ), X( LDX, * )
|
||||
DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * )
|
||||
* ..
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ZERO, ONE
|
||||
PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
|
||||
COMPLEX*16 CZERO, CONE
|
||||
PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) )
|
||||
PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) )
|
||||
INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN
|
||||
PARAMETER ( NRHSMIN = 2, NBRHS = 32 )
|
||||
PARAMETER ( NBMIN = 8, NBMAX = 64 )
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS )
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER
|
||||
INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
|
||||
$ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
|
||||
$ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
|
||||
DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
|
||||
$ SCAMIN, SMLNUM, TMAX
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM
|
||||
EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL ZLATRS, ZDSCAL, XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC ABS, MAX, MIN
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
NOTRAN = LSAME( TRANS, 'N' )
|
||||
NOUNIT = LSAME( DIAG, 'N' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
*
|
||||
* Partition A and X into blocks.
|
||||
*
|
||||
NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) )
|
||||
NB = MIN( NBMAX, NB )
|
||||
NBA = MAX( 1, (N + NB - 1) / NB )
|
||||
NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
|
||||
*
|
||||
* Compute the workspace
|
||||
*
|
||||
* The workspace comprises two parts.
|
||||
* The first part stores the local scale factors. Each simultaneously
|
||||
* computed right-hand side requires one local scale factor per block
|
||||
* row. WORK( I + KK * LDS ) is the scale factor of the vector
|
||||
* segment associated with the I-th block row and the KK-th vector
|
||||
* in the block column.
|
||||
LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
|
||||
LDS = NBA
|
||||
* The second part stores upper bounds of the triangular A. There are
|
||||
* a total of NBA x NBA blocks, of which only the upper triangular
|
||||
* part or the lower triangular part is referenced. The upper bound of
|
||||
* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
|
||||
LANRM = NBA * NBA
|
||||
AWRK = LSCALE
|
||||
WORK( 1 ) = LSCALE + LANRM
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
|
||||
$ LSAME( TRANS, 'C' ) ) THEN
|
||||
INFO = -2
|
||||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
|
||||
INFO = -3
|
||||
ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
|
||||
$ LSAME( NORMIN, 'N' ) ) THEN
|
||||
INFO = -4
|
||||
ELSE IF( N.LT.0 ) THEN
|
||||
INFO = -5
|
||||
ELSE IF( NRHS.LT.0 ) THEN
|
||||
INFO = -6
|
||||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -8
|
||||
ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
|
||||
INFO = -10
|
||||
ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
|
||||
INFO = -14
|
||||
END IF
|
||||
IF( INFO.NE.0 ) THEN
|
||||
CALL XERBLA( 'ZLATRS3', -INFO )
|
||||
RETURN
|
||||
ELSE IF( LQUERY ) THEN
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Initialize scaling factors
|
||||
*
|
||||
DO KK = 1, NRHS
|
||||
SCALE( KK ) = ONE
|
||||
END DO
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( MIN( N, NRHS ).EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Determine machine dependent constant to control overflow.
|
||||
*
|
||||
BIGNUM = DLAMCH( 'Overflow' )
|
||||
SMLNUM = DLAMCH( 'Safe Minimum' )
|
||||
*
|
||||
* Use unblocked code for small problems
|
||||
*
|
||||
IF( NRHS.LT.NRHSMIN ) THEN
|
||||
CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
|
||||
$ SCALE( 1 ), CNORM, INFO )
|
||||
DO K = 2, NRHS
|
||||
CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Compute norms of blocks of A excluding diagonal blocks and find
|
||||
* the block with the largest norm TMAX.
|
||||
*
|
||||
TMAX = ZERO
|
||||
DO J = 1, NBA
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
IF ( UPPER ) THEN
|
||||
IFIRST = 1
|
||||
ILAST = J - 1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
END IF
|
||||
DO I = IFIRST, ILAST
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Compute upper bound of A( I1:I2-1, J1:J2-1 ).
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + I+(J-1)*NBA ) = ANRM
|
||||
ELSE
|
||||
ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
|
||||
WORK( AWRK + J+(I-1) * NBA ) = ANRM
|
||||
END IF
|
||||
TMAX = MAX( TMAX, ANRM )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN
|
||||
*
|
||||
* Some matrix entries have huge absolute value. At least one upper
|
||||
* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
|
||||
* number, either due to overflow in LANGE or due to Inf in A.
|
||||
* Fall back to LATRS. Set normin = 'N' for every right-hand side to
|
||||
* force computation of TSCAL in LATRS to avoid the likely overflow
|
||||
* in the computation of the column norms CNORM.
|
||||
*
|
||||
DO K = 1, NRHS
|
||||
CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
|
||||
$ SCALE( K ), CNORM, INFO )
|
||||
END DO
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Every right-hand side requires workspace to store NBA local scale
|
||||
* factors. To save workspace, X is computed successively in block columns
|
||||
* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
|
||||
* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
|
||||
DO K = 1, NBX
|
||||
* Loop over block columns (index = K) of X and, for column-wise scalings,
|
||||
* over individual columns (index = KK).
|
||||
* K1: column index of the first column in X( J, K )
|
||||
* K2: column index of the first column in X( J, K+1 )
|
||||
* so the K2 - K1 is the column count of the block X( J, K )
|
||||
K1 = (K-1)*NBRHS + 1
|
||||
K2 = MIN( K*NBRHS, NRHS ) + 1
|
||||
*
|
||||
* Initialize local scaling factors of current block column X( J, K )
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
DO I = 1, NBA
|
||||
WORK( I+KK*LDS ) = ONE
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
ELSE
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
|
||||
* where op(A) = A**T or op(A) = A**H
|
||||
*
|
||||
IF( UPPER ) THEN
|
||||
JFIRST = 1
|
||||
JLAST = NBA
|
||||
JINC = 1
|
||||
ELSE
|
||||
JFIRST = NBA
|
||||
JLAST = 1
|
||||
JINC = -1
|
||||
END IF
|
||||
END IF
|
||||
|
||||
DO J = JFIRST, JLAST, JINC
|
||||
* J1: row index of the first row in A( J, J )
|
||||
* J2: row index of the first row in A( J+1, J+1 )
|
||||
* so that J2 - J1 is the row count of the block A( J, J )
|
||||
J1 = (J-1)*NB + 1
|
||||
J2 = MIN( J*NB, N ) + 1
|
||||
*
|
||||
* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( KK.EQ.1 ) THEN
|
||||
CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
ELSE
|
||||
CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
|
||||
$ A( J1, J1 ), LDA, X( J1, RHS ),
|
||||
$ SCALOC, CNORM, INFO )
|
||||
END IF
|
||||
* Find largest absolute value entry in the vector segment
|
||||
* X( J1:J2-1, RHS ) as an upper bound for the worst case
|
||||
* growth in the linear updates.
|
||||
XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ),
|
||||
$ LDX, W )
|
||||
*
|
||||
IF( SCALOC .EQ. ZERO ) THEN
|
||||
* LATRS found that A is singular through A(j,j) = 0.
|
||||
* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
|
||||
* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is
|
||||
* set by LATRS.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, J1-1
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
DO II = J2, N
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
|
||||
* LATRS computed a valid scale factor, but combined with
|
||||
* the current scaling the solution does not have a
|
||||
* scale factor > 0.
|
||||
*
|
||||
* Set WORK( J+KK*LDS ) to smallest valid scale
|
||||
* factor and increase SCALOC accordingly.
|
||||
SCAL = WORK( J+KK*LDS ) / SMLNUM
|
||||
SCALOC = SCALOC * SCAL
|
||||
WORK( J+KK*LDS ) = SMLNUM
|
||||
* If LATRS overestimated the growth, x may be
|
||||
* rescaled to preserve a valid combined scale
|
||||
* factor WORK( J, KK ) > 0.
|
||||
RSCAL = ONE / SCALOC
|
||||
IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
|
||||
XNRM( KK ) = XNRM( KK ) * RSCAL
|
||||
CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
|
||||
SCALOC = ONE
|
||||
ELSE
|
||||
* The system op(A) * x = b is badly scaled and its
|
||||
* solution cannot be represented as (1/scale) * x.
|
||||
* Set x to zero. This approach deviates from LATRS
|
||||
* where a completely meaningless non-zero vector
|
||||
* is returned that is not a solution to op(A) * x = b.
|
||||
SCALE( RHS ) = ZERO
|
||||
DO II = 1, N
|
||||
X( II, KK ) = CZERO
|
||||
END DO
|
||||
* Discard the local scale factors.
|
||||
DO II = 1, NBA
|
||||
WORK( II+KK*LDS ) = ONE
|
||||
END DO
|
||||
SCALOC = ONE
|
||||
END IF
|
||||
END IF
|
||||
SCALOC = SCALOC * WORK( J+KK*LDS )
|
||||
WORK( J+KK*LDS ) = SCALOC
|
||||
END DO
|
||||
*
|
||||
* Linear block updates
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
ELSE
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
END IF
|
||||
ELSE
|
||||
IF( UPPER ) THEN
|
||||
IFIRST = J + 1
|
||||
ILAST = NBA
|
||||
IINC = 1
|
||||
ELSE
|
||||
IFIRST = J - 1
|
||||
ILAST = 1
|
||||
IINC = -1
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
DO I = IFIRST, ILAST, IINC
|
||||
* I1: row index of the first column in X( I, K )
|
||||
* I2: row index of the first column in X( I+1, K )
|
||||
* so the I2 - I1 is the row count of the block X( I, K )
|
||||
I1 = (I-1)*NB + 1
|
||||
I2 = MIN( I*NB, N ) + 1
|
||||
*
|
||||
* Prepare the linear update to be executed with GEMM.
|
||||
* For each column, compute a consistent scaling, a
|
||||
* scaling factor to survive the linear update, and
|
||||
* rescale the column segments, if necesssary. Then
|
||||
* the linear update is safely executed.
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
RHS = K1 + KK - 1
|
||||
* Compute consistent scaling
|
||||
SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
|
||||
*
|
||||
* Compute scaling factor to survive the linear update
|
||||
* simulating consistent scaling.
|
||||
*
|
||||
BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
|
||||
BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
|
||||
XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) )
|
||||
ANRM = WORK( AWRK + I+(J-1)*NBA )
|
||||
SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM )
|
||||
*
|
||||
* Simultaneously apply the robust update factor and the
|
||||
* consistency scaling factor to X( I, KK ) and X( J, KK ).
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
WORK( I+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
*
|
||||
SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
|
||||
IF( SCAL.NE.ONE ) THEN
|
||||
CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
|
||||
WORK( J+KK*LDS ) = SCAMIN*SCALOC
|
||||
END IF
|
||||
END DO
|
||||
*
|
||||
IF( NOTRAN ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
|
||||
*
|
||||
CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( I1, J1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
ELSE IF( LSAME( TRANS, 'T' ) ) THEN
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
|
||||
*
|
||||
CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
ELSE
|
||||
*
|
||||
* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K )
|
||||
*
|
||||
CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE,
|
||||
$ A( J1, I1 ), LDA, X( J1, K1 ), LDX,
|
||||
$ CONE, X( I1, K1 ), LDX )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
||||
*
|
||||
* Reduce local scaling factors
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
RHS = K1 + KK - 1
|
||||
DO I = 1, NBA
|
||||
SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
|
||||
END DO
|
||||
END DO
|
||||
*
|
||||
* Realize consistent scaling
|
||||
*
|
||||
DO KK = 1, K2 - K1
|
||||
RHS = K1 + KK - 1
|
||||
IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
|
||||
DO I = 1, NBA
|
||||
I1 = (I - 1) * NB + 1
|
||||
I2 = MIN( I * NB, N ) + 1
|
||||
SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
|
||||
IF( SCAL.NE.ONE )
|
||||
$ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
*
|
||||
* End of ZLATRS3
|
||||
*
|
||||
END
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue