From 6f09e4c1212db6f33407efd8bb4588335c626fee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:37:28 +0100 Subject: [PATCH 1/2] Improve FMA usage in ?LAQR5 (Reference-LAPACK PR681) --- lapack-netlib/SRC/claqr5.f | 96 ++++++++++++++++++---------------- lapack-netlib/SRC/dlaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/slaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/zlaqr5.f | 97 +++++++++++++++++++---------------- 4 files changed, 223 insertions(+), 174 deletions(-) diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 95cc33b9d..0a01cc226 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 ) * .. * .. Local Scalars .. - COMPLEX ALPHA, BETA, CDUM, REFSUM + COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*CONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = CONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ CONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,28 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + CONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -688,13 +692,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )* - $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )* + $ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +718,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +737,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 0c63ab800..43b4ac72a 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index b9bae9376..a4f805674 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 3185508bc..4fa5ee5b0 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0d0, RONE = 1.0d0 ) * .. * .. Local Scalars .. - COMPLEX*16 ALPHA, BETA, CDUM, REFSUM + COMPLEX*16 ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*DCONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = DCONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ DCONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,29 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + $ + DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + DCONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -688,13 +693,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + DCONJG( V( 2, M ) )*H( K+2, J ) + $ + DCONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +719,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +738,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF From c6816bb5760827fb073fd65db49ee2178933e20d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:39:12 +0100 Subject: [PATCH 2/2] Use normwise criterion in multishift QZ (Reference-LAPACK PR698) --- lapack-netlib/SRC/claqz0.f | 16 ++++++---------- lapack-netlib/SRC/dlaqz0.f | 16 ++++++---------- lapack-netlib/SRC/slaqz0.f | 17 +++++++---------- lapack-netlib/SRC/zlaqz0.f | 17 +++++++---------- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 2284fd65d..9cc25c6dc 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -299,7 +299,7 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, BNORM, BTOL COMPLEX :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -312,7 +312,7 @@ * External Functions EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, $ CLARTG, CROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -466,6 +466,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = CLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -528,15 +531,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index 1bf65fd60..5b0965406 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -322,7 +322,7 @@ * Local scalars DOUBLE PRECISION :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, - $ TEMP, SWAP + $ TEMP, SWAP, BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -334,7 +334,7 @@ * External Functions EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, $ DLARTG, DROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -486,6 +486,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = DLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -562,15 +565,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 15913be88..69f402914 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -318,7 +318,8 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP + REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP, + $ BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -330,7 +331,7 @@ * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -482,6 +483,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = SLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -558,15 +562,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 2616f20b5..0d8884ed5 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -300,7 +300,8 @@ PARAMETER( ZERO = 0.0D0, ONE = 1.0D0, HALF = 0.5D0 ) * Local scalars - DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, + $ BNORM, BTOL COMPLEX*16 :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -313,7 +314,7 @@ * External Functions EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, $ ZLARTG, ZROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -467,6 +468,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = ZLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -529,15 +533,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it