From 87d2e314db541342e42040f0d0ec93147fd9fe04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Apr 2021 13:50:55 +0200 Subject: [PATCH 01/10] Import packing improvements in LAPACK xLAQR from Reference-LAPACK PR 480+535 --- lapack-netlib/SRC/chseqr.f | 4 +- lapack-netlib/SRC/claqr0.f | 16 +- lapack-netlib/SRC/claqr4.f | 16 +- lapack-netlib/SRC/claqr5.f | 693 ++++++++++++++++-------------------- lapack-netlib/SRC/dhseqr.f | 4 +- lapack-netlib/SRC/dlaqr0.f | 16 +- lapack-netlib/SRC/dlaqr4.f | 16 +- lapack-netlib/SRC/dlaqr5.f | 674 +++++++++++++++-------------------- lapack-netlib/SRC/shseqr.f | 4 +- lapack-netlib/SRC/slaqr0.f | 16 +- lapack-netlib/SRC/slaqr4.f | 16 +- lapack-netlib/SRC/slaqr5.f | 674 +++++++++++++++-------------------- lapack-netlib/SRC/zhseqr.f | 4 +- lapack-netlib/SRC/zlaqr0.f | 16 +- lapack-netlib/SRC/zlaqr4.f | 16 +- lapack-netlib/SRC/zlaqr5.f | 694 ++++++++++++++++--------------------- 16 files changed, 1238 insertions(+), 1641 deletions(-) diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index cfcf725b2..32b6fa87b 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -320,10 +320,10 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare CLAHQR failure. NL > NTINY = 11 is +* . through a rare CLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f index 2f0ea20db..233721352 100644 --- a/lapack-netlib/SRC/claqr0.f +++ b/lapack-netlib/SRC/claqr0.f @@ -260,7 +260,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -355,22 +355,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. 
(Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
*> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . 
(including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, 
NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. 
==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. 
This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . 
multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE * -* ==== Vertical multiply ==== +* ==== Vertical multiply ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting 
U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of DLAQR5 ==== * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
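The accumulated-update path that KACC22 = 1 (and now also KACC22 = 2) selects boils down to forming an orthogonal accumulation matrix U during the near-the-diagonal chase and then applying it to the far-from-diagonal panels with a single matrix-matrix product, as the DGEMM/DLACPY "Horizontal Multiply" loops earlier in this patch do. A minimal C sketch of that step, expressed through CBLAS purely for illustration; the helper name horizontal_update and the use of cblas_dgemm/memcpy in place of DGEMM/DLACPY are assumptions of the sketch, not part of the patch.

#include <cblas.h>
#include <string.h>

/* Overwrite the nu-by-jlen column-major panel hpanel (leading dimension ldh)
 * with U**T * hpanel, using wh (leading dimension ldwh >= nu) as scratch --
 * the same pattern as the "Horizontal Multiply" loop in xLAQR5. */
static void horizontal_update(int nu, int jlen,
                              const double *u, int ldu,
                              double *hpanel, int ldh,
                              double *wh, int ldwh)
{
    cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                nu, jlen, nu, 1.0, u, ldu, hpanel, ldh, 0.0, wh, ldwh);
    for (int j = 0; j < jlen; ++j)          /* copy back, as xLACPY does */
        memcpy(hpanel + (size_t)j * ldh, wh + (size_t)j * ldwh,
               (size_t)nu * sizeof(double));
}

The vertical multiply and the optional Z update are the mirror image of this, with U applied from the right.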
*> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). 
The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
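In scalar terms, the combined criterion this comment describes (and that is coded immediately below) can be written as a small C helper. The name deflate_subdiag is hypothetical, and tst1, ulp and smlnum are taken as inputs rather than recomputed from neighbouring entries; the helper returns nonzero when the subdiagonal entry H(k+1,k) may be flushed to zero.

#include <math.h>

/* hkk = H(k,k), hk1k = H(k+1,k), hkk1 = H(k,k+1), hk1k1 = H(k+1,k+1);
 * tst1 = |H(k,k)| + |H(k+1,k+1)| (augmented as above when it is zero);
 * ulp = relative machine precision, smlnum = safe minimum threshold. */
static int deflate_subdiag(double hkk, double hk1k, double hkk1, double hk1k1,
                           double tst1, double ulp, double smlnum)
{
    if (hk1k == 0.0)
        return 1;
    /* Traditional small-compared-to-nearby-diagonals criterion. */
    if (fabs(hk1k) > fmax(smlnum, ulp * tst1))
        return 0;
    /* Ahues & Tisseur (LAWN 122, 1997) refinement. */
    double h12 = fmax(fabs(hk1k), fabs(hkk1));
    double h21 = fmin(fabs(hk1k), fabs(hkk1));
    double h11 = fmax(fabs(hk1k1), fabs(hkk - hk1k1));
    double h22 = fmin(fabs(hk1k1), fabs(hkk - hk1k1));
    double scl = h11 + h12;
    double tst2 = h22 * (h11 / scl);
    return tst2 == 0.0 || h21 * (h12 / scl) <= fmax(smlnum, ulp * tst2);
}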
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
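The three-term REFSUM updates used here, and throughout the sweep, are nothing more than the application of a length-3 Householder reflector whose scale and essential part are stored in a column of V (SLARFG convention: v = (1, v2, v3)**T, reflector I - tau*v*v**T). A minimal real-arithmetic sketch with a hypothetical helper name:

/* Apply I - tau*v*v**T, v = (1, v2, v3)**T, to the three entries x1, x2, x3.
 * In real arithmetic the same pattern serves for updates from the left
 * (three consecutive entries of a column) and from the right (three
 * consecutive entries of a row). */
static void apply_reflector3(double tau, double v2, double v3,
                             double *x1, double *x2, double *x3)
{
    double refsum = tau * (*x1 + v2 * *x2 + v3 * *x3);
    *x1 -= refsum;
    *x2 -= refsum * v2;
    *x3 -= refsum * v3;
}

With tau = V(1,M), v2 = V(2,M) and v3 = V(3,M), this reproduces the H, U and Z updates written out explicitly in these hunks.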
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * - IF( ACCUM ) THEN +* ==== Perform update from left for subsequent +* . column. ==== * -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( 
JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
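The packing arithmetic behind these constant changes is easy to check in isolation: a sweep with NS = 2*NBMPS shifts used to occupy a slab of KDU = 6*NBMPS-3 = 3*NS-3 columns, while the repacked chains fit in KDU = 4*NBMPS = 2*NS, which is consistent with raising the cap on simultaneous shifts from (N+6)/9 to (N-3)/6. A throwaway comparison, not part of the patch:

#include <stdio.h>

/* Print old and new slab widths for a few even shift counts NS. */
int main(void)
{
    for (int ns = 2; ns <= 12; ns += 2)
        printf("NS = %2d   old KDU = %2d   new KDU = %2d\n",
               ns, 3 * ns - 3, 2 * ns);
    return 0;
}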
*> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . 
(including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
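In the complex routine the same three-term pattern appears, but the conjugates are placed according to whether the reflector Q = I - tau*v*v**H (ZLARFG convention, v = (1, v2, v3)**T) is applied as Q**H from the left or as Q from the right. A small C99 sketch of the two cases; the helper names are hypothetical and only illustrate the DCONJG placement used in this routine.

#include <complex.h>

/* x := Q**H * x = x - conj(tau) * v * (v**H * x) for a column (x1,x2,x3)**T. */
static void apply_qh_from_left(double complex tau,
                               double complex v2, double complex v3,
                               double complex *x1, double complex *x2,
                               double complex *x3)
{
    double complex refsum = conj(tau) * (*x1 + conj(v2) * *x2 + conj(v3) * *x3);
    *x1 -= refsum;
    *x2 -= refsum * v2;
    *x3 -= refsum * v3;
}

/* y := y * Q = y - tau * (y * v) * v**H for a row (y1, y2, y3). */
static void apply_q_from_right(double complex tau,
                               double complex v2, double complex v3,
                               double complex *y1, double complex *y2,
                               double complex *y3)
{
    double complex refsum = tau * (*y1 + v2 * *y2 + v3 * *y3);
    *y1 -= refsum;
    *y2 -= refsum * conj(v2);
    *y3 -= refsum * conj(v3);
}

In real arithmetic the two cases coincide, which is why SLAQR5/DLAQR5 use a single update pattern without conjugation.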
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', 
JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * From 53ee0b76bb066b928b4288e2dbb6ca25c389ea86 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Apr 2021 18:01:14 -0700 Subject: [PATCH 02/10] x86: Enable Intel CET When Intel CET is enabled, we need to include in assembly codes to mark Intel CET support and place _CET_ENDBR at the function entry. --- common.h | 9 +++++++++ common_x86.h | 3 ++- common_x86_64.h | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/common.h b/common.h index 862e0b4db..ac795937c 100644 --- a/common.h +++ b/common.h @@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + #ifdef ARCH_X86 #include "common_x86.h" #endif diff --git a/common_x86.h b/common_x86.h index ec928e236..bc77eca58 100644 --- a/common_x86.h +++ b/common_x86.h @@ -340,7 +340,8 @@ REALNAME: .align 16; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call mcount diff --git a/common_x86_64.h b/common_x86_64.h index b813336c6..729a055ce 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,7 +451,8 @@ REALNAME: .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) From 254774f5a621b7ecd3775ac0c436f6e47189b909 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 13:10:16 +0200 Subject: [PATCH 03/10] Add const qualifiers --- lapack-netlib/LAPACKE/include/lapack.h | 12 ++++++------ lapack-netlib/LAPACKE/include/lapacke.h | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 828d3279e..341efabda 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -4768,7 +4768,7 @@ void LAPACK_chegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, - lapack_complex_float* B, lapack_int const* ldb, + const lapack_complex_float* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST) @@ 
-4776,7 +4776,7 @@ void LAPACK_zhegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, - lapack_complex_double* B, lapack_int const* ldb, + const lapack_complex_double* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV) @@ -11556,7 +11556,7 @@ void LAPACK_zsytrs( void LAPACK_csytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_float* B, lapack_int const* ldb, lapack_complex_float* work, lapack_int* info ); @@ -11565,7 +11565,7 @@ void LAPACK_csytrs2( void LAPACK_dsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - double* A, lapack_int const* lda, lapack_int const* ipiv, + const double* A, lapack_int const* lda, lapack_int const* ipiv, double* B, lapack_int const* ldb, double* work, lapack_int* info ); @@ -11574,7 +11574,7 @@ void LAPACK_dsytrs2( void LAPACK_ssytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - float* A, lapack_int const* lda, lapack_int const* ipiv, + const float* A, lapack_int const* lda, lapack_int const* ipiv, float* B, lapack_int const* ldb, float* work, lapack_int* info ); @@ -11583,7 +11583,7 @@ void LAPACK_ssytrs2( void LAPACK_zsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_double* B, lapack_int const* ldb, lapack_complex_double* work, lapack_int* info ); diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 012c104bb..b280dde0a 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo, lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, @@ -6932,11 +6932,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range, lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz, @@ -10553,11 +10553,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb ); lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* 
a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ); @@ -10718,10 +10718,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, double* work, lapack_int nb ); lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ); lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10813,10 +10813,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, float* work, lapack_int nb ); lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ); lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10898,11 +10898,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb ); lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ); From 5cc35abc3dfa3ae5c4466948b23cdd08ca366a1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 13:22:10 +0200 Subject: [PATCH 04/10] Apply MKL team fixes to the LAPACKE interfaces (Reference-LAPACK PR 534) Removed spurious checks for INFO in xLACPY,xLASET after routines not returning any,and redundant requirements for ldvt in xGESVD_WORK --- lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_cheev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chetri2x.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_claset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_csyconv.c | 2 +- 
lapack-netlib/LAPACKE/src/lapacke_csytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ctrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ctrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cungtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_dorgtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsygst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsygv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_slaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_sorgtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssygst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssygv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_strttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_strttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_zheev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_zsyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zungtr.c | 2 +- 75 files changed, 87 insertions(+), 103 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c index 558a7f308..4256c0f04 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c @@ 
-56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const 
lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index d9fb2dca0..71ad23f2f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c index 7dbc9bb88..671def1df 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index db75a6609..0b1c54b9b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? 
m : n; - if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c index cca9be489..36ff7c40c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index f696c608f..78f9e80ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 6f9c02f6a..d68989aa6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, 
lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + 
lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int 
LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); 
- if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( 
LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { From 904b221f03a986be12b30bc21c872eaa79a6427e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 14:47:22 +0200 Subject: [PATCH 05/10] Add cast to prevent overflow of intermediate result --- interface/imatcopy.c | 4 ++-- interface/zimatcopy.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 93ffd69f9..91975f7f4 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) From 98ebc8ac5987af4ef44618d95e34ae122ec24c20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 14:48:19 +0200 Subject: [PATCH 06/10] Add casts to prevent overflow of intermediate result --- ctest/c_cblas2.c | 44 +++++++++++++++--------------- ctest/c_dblas2.c | 42 ++++++++++++++--------------- ctest/c_dblas3.c | 46 +++++++++++++++---------------- ctest/c_sblas2.c | 42 ++++++++++++++--------------- ctest/c_sblas3.c | 46 +++++++++++++++---------------- ctest/c_zblas2.c | 44 +++++++++++++++--------------- ctest/c_zblas3.c | 70 ++++++++++++++++++++++++------------------------ 7 files changed, 167 insertions(+), 167 deletions(-) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( 
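The (size_t) casts in imatcopy/zimatcopy and in the ctest helpers address the same failure mode: lda, ldb, m, n and k are blasint (typically 32-bit), so a product such as (*lda) * (*ldb) * sizeof(FLOAT) is evaluated in int and can overflow before it is converted to the size_t that malloc expects. Widening one factor first keeps the whole computation in size_t. A self-contained sketch with made-up dimensions (the variable names and sizes are hypothetical, chosen only so that the 32-bit product would overflow):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int32_t lda = 70000, ldb = 70000;   /* hypothetical leading dimensions */

    /* lda * ldb is about 4.9e9 and does not fit in a 32-bit int, so the
     * uncast expression lda * ldb * sizeof(double) would overflow (signed
     * overflow, undefined behavior) before ever reaching size_t.  Promoting
     * the first factor makes every multiplication a size_t multiplication. */
    size_t msize = (size_t)lda * ldb * sizeof(double);

    double *b = malloc(msize);
    if (b == NULL) {
        fprintf(stderr, "allocation of %zu bytes failed\n", msize);
        return 1;
    }
    free(b);
    return 0;
}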
i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + 
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( 
double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i Date: Sat, 1 May 2021 21:31:13 +0200 Subject: [PATCH 07/10] Fix possible division by zero in xTGSJA (Reference-LAPACK PR502) --- lapack-netlib/SRC/ctgsja.f | 9 +++++---- lapack-netlib/SRC/dtgsja.f | 9 +++++---- lapack-netlib/SRC/stgsja.f | 9 +++++---- lapack-netlib/SRC/ztgsja.f | 9 +++++---- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f index 38a61068e..c96cbe022 100644 --- 
a/lapack-netlib/SRC/ctgsja.f +++ b/lapack-netlib/SRC/ctgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), @@ -424,7 +424,8 @@ $ SLARTG, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, CONJG, MAX, MIN, REAL + INTRINSIC ABS, CONJG, MAX, MIN, REAL, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = REAL( A( K+I, N-L+I ) ) B1 = REAL( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f index 66f32b790..537bd3f4f 100644 --- a/lapack-netlib/SRC/dtgsja.f +++ b/lapack-netlib/SRC/dtgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ DSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. 
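The xTGSJA change replaces the old A1 .NE. ZERO guard with an unconditional GAMMA = B1 / A1 followed by a finiteness test, (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) with HUGENUM = HUGE(ZERO). Besides the literal division by zero, this also rejects quotients that overflow to infinity when A1 is nonzero but tiny, and a NaN result fails both comparisons, so it takes the alternate branch as well. A short C sketch of the equivalent predicate (illustrative only, assuming IEEE-754 arithmetic):

#include <math.h>
#include <stdio.h>

/* Compute gamma = b1 / a1 unconditionally and report whether it is usable:
 * finite quotients pass, while Inf (overflow or division by zero) and NaN
 * (0/0) fail, mirroring -HUGE(ZERO) <= GAMMA <= HUGE(ZERO) in the Fortran. */
static int gamma_is_usable(double a1, double b1, double *gamma)
{
    *gamma = b1 / a1;
    return isfinite(*gamma);
}

int main(void)
{
    double g;
    printf("%d\n", gamma_is_usable(2.0, 1.0, &g));      /* 1: ordinary quotient */
    printf("%d\n", gamma_is_usable(0.0, 1.0, &g));      /* 0: division by zero  */
    printf("%d\n", gamma_is_usable(0.0, 0.0, &g));      /* 0: NaN quotient      */
    printf("%d\n", gamma_is_usable(1e-300, 1e300, &g)); /* 0: overflow to Inf   */
    return 0;
}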
* @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) From d77d9bc920affa2f2d3e0a5479cb05c676a9246e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 11:24:50 +0200 Subject: [PATCH 08/10] Handle norm NaN value (Reference LAPACK PR471) --- lapack-netlib/SRC/cgesdd.f | 8 ++++++-- lapack-netlib/SRC/dgesdd.f | 8 ++++++-- lapack-netlib/SRC/sgesdd.f | 8 ++++++-- lapack-netlib/SRC/zgesdd.f | 8 ++++++-- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -281,9 +281,9 @@ $ CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLAMCH, CLANGE + EXTERNAL LSAME, SLAMCH, CLANGE, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = CLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN ( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 0218900d2..80d18041c 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL DLAMCH, DLANGE, LSAME + EXTERNAL DLAMCH, DLANGE, LSAME, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = DLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index 689494dd1..89e03a002 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, LSAME + EXTERNAL SLAMCH, SLANGE, LSAME, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = SLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN ISCL = 1 From c26780d4510447ca101bfc44a3ad87018a3e9d8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 11:40:56 +0200 Subject: [PATCH 09/10] Initialize X and Y to zero for N=0 (Reference-LAPACK PR463) --- lapack-netlib/SRC/cggglm.f | 11 +++++++++-- lapack-netlib/SRC/dggglm.f | 11 +++++++++-- lapack-netlib/SRC/sggglm.f | 11 +++++++++-- lapack-netlib/SRC/zggglm.f | 11 +++++++++-- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 336f41909..9c8e0eec3 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * From 4bf00da8fbd85a3478086c822e8df4606ecefdc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 12:18:17 +0200 Subject: [PATCH 10/10] Avoid allocating the transposed triangular matrix (Reference-LAPACK PR382) --- .../LAPACKE/src/lapacke_clantr_work.c | 39 ++++++++++--------- .../LAPACKE/src/lapacke_dlantr_work.c | 38 +++++++++--------- .../LAPACKE/src/lapacke_slantr_work.c | 38 +++++++++--------- .../LAPACKE/src/lapacke_zlantr_work.c | 39 ++++++++++--------- 4 files changed, 80 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* Allocate 
memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, 
a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for temporary 
array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else {
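The rewritten lantr wrappers drop the transposed scratch copy by exploiting the fact that a row-major m-by-n triangular matrix occupies the same memory as a column-major n-by-m matrix holding its transpose: the one-norm of A is the infinity-norm of A**T (and vice versa), the max-abs and Frobenius norms are unchanged, the upper triangle becomes the lower triangle, and m and n trade places; only the infinity-norm path then needs a work array of length MAX(1,n) for the transposed view. A compact sketch of the same argument mapping (the helper below is illustrative, not the LAPACKE code):

#include <ctype.h>

/* Translate (norm, uplo, m, n) for a row-major triangular matrix into the
 * arguments of a column-major xLANTR call on the same buffer, using
 * norm1(A) == normInf(A**T) and upper(A) == lower(A**T). */
static void map_rowmajor_lantr(char norm, char uplo, int m, int n,
                               char *norm_t, char *uplo_t, int *m_t, int *n_t)
{
    char ln = (char)tolower((unsigned char)norm);
    if (ln == '1' || ln == 'o')
        *norm_t = 'I';               /* one-norm of A -> inf-norm of A**T */
    else if (ln == 'i')
        *norm_t = '1';               /* inf-norm of A -> one-norm of A**T */
    else
        *norm_t = norm;              /* 'M' and 'F'/'E' are unchanged     */
    *uplo_t = (tolower((unsigned char)uplo) == 'u') ? 'L' : 'U';
    *m_t = n;                        /* the column-major view is n-by-m   */
    *n_t = m;
}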