diff --git a/.travis.yml b/.travis.yml
index 3f8f766fe..482b4f648 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -204,6 +204,17 @@ matrix:
       env:
         - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
 
+    - <<: *test-macos
+      osx_image: xcode12
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
+        - brew update
+        - brew install gcc@10 # for gfortran
+      script:
+        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+      env:
+        - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
+
     - <<: *test-macos
       osx_image: xcode10.0
       env:
diff --git a/Makefile.zarch b/Makefile.zarch
index b841d9b4d..092ca2589 100644
--- a/Makefile.zarch
+++ b/Makefile.zarch
@@ -12,5 +12,5 @@ endif
 # Enable floating-point expression contraction for clang, since it is the
 # default for gcc
 ifeq ($(C_COMPILER), CLANG)
-CCOMMON_OPT += -ffp-contract=fast
+CCOMMON_OPT += -ffp-contract=on
 endif
diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c
index f9e89a452..57bb89c0a 100644
--- a/kernel/zarch/cscal.c
+++ b/kernel/zarch/cscal.c
@@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include "common.h"
+/*
+ * Avoid contraction of floating point operations, specifically fused
+ * multiply-add, because it can cause unexpected results in complex
+ * multiplication.
+ */
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC optimize ("fp-contract=off")
+#endif
 
-static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
-  __asm__("vlrepf %%v0,0(%[alpha])\n\t"
-          "vlef %%v1,4(%[alpha]),0\n\t"
-          "vlef %%v1,4(%[alpha]),2\n\t"
-          "vflcsb %%v1,%%v1\n\t"
-          "vlef %%v1,4(%[alpha]),1\n\t"
-          "vlef %%v1,4(%[alpha]),3\n\t"
-          "srlg %[n],%[n],4\n\t"
-          "xgr %%r1,%%r1\n\t"
-          "0:\n\t"
-          "pfd 2, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "verllg %%v24,%%v16,32\n\t"
-          "verllg %%v25,%%v17,32\n\t"
-          "verllg %%v26,%%v18,32\n\t"
-          "verllg %%v27,%%v19,32\n\t"
-          "verllg %%v28,%%v20,32\n\t"
-          "verllg %%v29,%%v21,32\n\t"
-          "verllg %%v30,%%v22,32\n\t"
-          "verllg %%v31,%%v23,32\n\t"
-          "vfmsb %%v16,%%v16,%%v0\n\t"
-          "vfmsb %%v17,%%v17,%%v0\n\t"
-          "vfmsb %%v18,%%v18,%%v0\n\t"
-          "vfmsb %%v19,%%v19,%%v0\n\t"
-          "vfmsb %%v20,%%v20,%%v0\n\t"
-          "vfmsb %%v21,%%v21,%%v0\n\t"
-          "vfmsb %%v22,%%v22,%%v0\n\t"
-          "vfmsb %%v23,%%v23,%%v0\n\t"
-          "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
-          "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
-          "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
-          "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
-          "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
-          "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
-          "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
-          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
-          "vst %%v16,0(%%r1,%[x])\n\t"
-          "vst %%v17,16(%%r1,%[x])\n\t"
-          "vst %%v18,32(%%r1,%[x])\n\t"
-          "vst %%v19,48(%%r1,%[x])\n\t"
-          "vst %%v20,64(%%r1,%[x])\n\t"
-          "vst %%v21,80(%%r1,%[x])\n\t"
-          "vst %%v22,96(%%r1,%[x])\n\t"
-          "vst %%v23,112(%%r1,%[x])\n\t"
-          "agfi %%r1,128\n\t"
-          "brctg %[n],0b"
-          : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
-          : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
-            [alpha] "a"(alpha)
-          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
-            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-            "v31");
+#if defined(__clang__)
+#pragma clang fp contract(off)
+#endif
+
+#include "common.h"
+#include "vector-common.h"
+
+static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
+  vector_float da_r_vec = vec_splats(da_r);
+  vector_float da_i_vec = { -da_i, da_i, -da_i, da_i };
+
+  vector_float *x_vec_ptr = (vector_float *)x;
+
+#pragma GCC unroll 16
+  for (size_t i = 0; i < n/2; i++) {
+    vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
+    vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]};
+
+    x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
+  }
 }
 
 static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
@@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
         : "cc", "r1", "v0");
 }
 
-static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
                                BLASLONG inc_x) {
   BLASLONG i;
   BLASLONG inc_x2 = 2 * inc_x;
   BLASLONG inc_x3 = inc_x2 + inc_x;
   FLOAT t0, t1, t2, t3;
-  FLOAT da_r = alpha[0];
-  FLOAT da_i = alpha[1];
 
   for (i = 0; i < n; i += 4) {
     t0 = da_r * x[0] - da_i * x[1];
@@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
   BLASLONG n1 = n & -8;
 
   if (n1 > 0) {
-    alpha[0] = da_r;
-    alpha[1] = da_i;
-    cscal_kernel_inc_8(n1, alpha, x, inc_x);
+    cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
     j = n1;
     i = n1 * inc_x;
   }
@@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
     else if (da_i == 0)
       cscal_kernel_16_zero_i(n1, alpha, x);
     else
-      cscal_kernel_16(n1, alpha, x);
+      cscal_kernel_16(n1, da_r, da_i, x);
 
     i = n1 << 1;
     j = n1;
diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index ef0b1d1e3..30f3171d2 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -30,12 +30,13 @@
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include "common.h"
-#include <vecintrin.h>
+#include "vector-common.h"
 
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+
 #ifdef COMPLEX
 #error "Handling for complex numbers is not supported in this kernel"
 #endif
@@ -153,37 +154,6 @@ static const bool backwards = false;
 * 3, May 2008.
 */
 
-#define VLEN_BYTES 16
-#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
-
-typedef FLOAT vector_float __attribute__ ((vector_size (16)));
-
-/**
- * Load a vector into register, and hint on 8-byte alignment to improve
- * performance. gcc-9 and newer will create these hints by itself. For older
- * compiler versions, use inline assembly to explicitly express the hint.
- * Provide explicit hex encoding to cater for binutils versions that do not know
- * about vector-load with alignment hints yet.
- *
- * Note that, for block sizes where we apply vectorization, vectors in A will
- * always be 8-byte aligned.
- */
-static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
-  vector_float const *restrict addr = (vector_float const *restrict)a;
-  vector_float y;
-
-#if __GNUC__ < 9 && !defined(__clang__)
-  // hex-encode vl %[out],%[addr],3
-  asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
-      : [ out ] "=v"(y)
-      : [ addr ] "R"(*addr));
-#else
-  y = *addr;
-#endif
-
-  return y;
-}
-
 /**
 * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
 *
diff --git a/kernel/zarch/vector-common.h b/kernel/zarch/vector-common.h
new file mode 100644
index 000000000..140d39d7b
--- /dev/null
+++ b/kernel/zarch/vector-common.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) IBM Corporation 2020.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of the OpenBLAS project nor the names of
+ *    its contributors may be used to endorse or promote products
+ *    derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <vecintrin.h>
+
+#define VLEN_BYTES 16
+#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
+
+typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES)));
+
+/**
+ * Load a vector into a register, and hint on 8-byte alignment to improve
+ * performance. gcc-9 and newer will create these hints by themselves. For
+ * older compiler versions, use inline assembly to explicitly express the
+ * hint. Provide explicit hex encoding to cater for binutils versions that
+ * do not yet know about vector loads with alignment hints.
+ *
+ * Note that, for block sizes where we apply vectorization, vectors in A will
+ * always be 8-byte aligned.
+ */
+static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
+  vector_float const *restrict addr = (vector_float const *restrict)a;
+  vector_float y;
+
+#if __GNUC__ < 9 && !defined(__clang__)
+  // hex-encode vl %[out],%[addr],3
+  asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
+      : [ out ] "=v"(y)
+      : [ addr ] "R"(*addr));
+#else
+  y = *addr;
+#endif
+
+  return y;
+}
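
Note: the new header keeps the kernels in plain C. vector_float is a GCC/clang vector-extension type, so ordinary +, * and element indexing compile down to the same z vector instructions the removed assembly used, and vec_load_hinted only adds the alignment hint on top of a normal load. The following sketch, which is not part of the patch, shows the intended consumption pattern; the function name axpy_sketch and the omitted scalar tail loop are illustrative assumptions only.

#include "common.h"
#include "vector-common.h"

/* Hypothetical example: y[i] += a * x[i], one vector register
 * (VLEN_FLOATS elements) per iteration. */
static void axpy_sketch(BLASLONG n, FLOAT a, FLOAT const *restrict x,
                        FLOAT *restrict y) {
  vector_float a_vec = vec_splats(a);  /* broadcast a into all lanes */

  for (BLASLONG i = 0; i + VLEN_FLOATS <= n; i += VLEN_FLOATS) {
    vector_float x_vec = vec_load_hinted(x + i);  /* alignment-hinted load */
    vector_float y_vec = vec_load_hinted(y + i);

    /* element-wise arithmetic via GCC vector extensions */
    *(vector_float *)(y + i) = y_vec + a_vec * x_vec;
  }
  /* a scalar loop for the n % VLEN_FLOATS remainder would follow here */
}

Because FLOAT resolves to float or double per build, the same source serves both precisions, and on gcc-9 and newer the compiler emits the alignment hints on its own.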
diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c
index a5a8f694d..d39b8447e 100644
--- a/kernel/zarch/zscal.c
+++ b/kernel/zarch/zscal.c
@@ -25,65 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include "common.h"
+/*
+ * Avoid contraction of floating point operations, specifically fused
+ * multiply-add, because it can cause unexpected results in complex
+ * multiplication.
+ */
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC optimize ("fp-contract=off")
+#endif
 
-static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
-  __asm__("vlrepg %%v0,0(%[alpha])\n\t"
-          "vleg %%v1,8(%[alpha]),0\n\t"
-          "wflcdb %%v1,%%v1\n\t"
-          "vleg %%v1,8(%[alpha]),1\n\t"
-          "srlg %[n],%[n],3\n\t"
-          "xgr %%r1,%%r1\n\t"
-          "0:\n\t"
-          "pfd 2, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vpdi %%v24,%%v16,%%v16,4\n\t"
-          "vpdi %%v25,%%v17,%%v17,4\n\t"
-          "vpdi %%v26,%%v18,%%v18,4\n\t"
-          "vpdi %%v27,%%v19,%%v19,4\n\t"
-          "vpdi %%v28,%%v20,%%v20,4\n\t"
-          "vpdi %%v29,%%v21,%%v21,4\n\t"
-          "vpdi %%v30,%%v22,%%v22,4\n\t"
-          "vpdi %%v31,%%v23,%%v23,4\n\t"
-          "vfmdb %%v16,%%v16,%%v0\n\t"
-          "vfmdb %%v17,%%v17,%%v0\n\t"
-          "vfmdb %%v18,%%v18,%%v0\n\t"
-          "vfmdb %%v19,%%v19,%%v0\n\t"
-          "vfmdb %%v20,%%v20,%%v0\n\t"
-          "vfmdb %%v21,%%v21,%%v0\n\t"
-          "vfmdb %%v22,%%v22,%%v0\n\t"
-          "vfmdb %%v23,%%v23,%%v0\n\t"
-          "vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
-          "vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
-          "vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
-          "vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
-          "vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
-          "vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
-          "vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
-          "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
-          "vst %%v16,0(%%r1,%[x])\n\t"
-          "vst %%v17,16(%%r1,%[x])\n\t"
-          "vst %%v18,32(%%r1,%[x])\n\t"
-          "vst %%v19,48(%%r1,%[x])\n\t"
-          "vst %%v20,64(%%r1,%[x])\n\t"
-          "vst %%v21,80(%%r1,%[x])\n\t"
-          "vst %%v22,96(%%r1,%[x])\n\t"
-          "vst %%v23,112(%%r1,%[x])\n\t"
-          "agfi %%r1,128\n\t"
-          "brctg %[n],0b"
-          : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
-          : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
-            [alpha] "a"(alpha)
-          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
-            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-            "v31");
+#if defined(__clang__)
+#pragma clang fp contract(off)
+#endif
+
+#include "common.h"
+#include "vector-common.h"
+
+static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
+  vector_float da_r_vec = vec_splats(da_r);
+  vector_float da_i_vec = { -da_i, da_i };
+
+  vector_float * x_vec_ptr = (vector_float *)x;
+
+#pragma GCC unroll 16
+  for (size_t i = 0; i < n; i++) {
+    vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
+    vector_float x_swapped = {x_vec[1], x_vec[0]};
+
+    x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
+  }
 }
 
 static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
@@ -195,14 +165,12 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
         : "cc", "r1", "v0");
 }
 
-static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
                                BLASLONG inc_x) {
   BLASLONG i;
   BLASLONG inc_x2 = 2 * inc_x;
   BLASLONG inc_x3 = inc_x2 + inc_x;
   FLOAT t0, t1, t2, t3;
-  FLOAT da_r = alpha[0];
-  FLOAT da_i = alpha[1];
 
   for (i = 0; i < n; i += 4) {
     t0 = da_r * x[0] - da_i * x[1];
@@ -320,9 +288,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
   BLASLONG n1 = n & -8;
 
   if (n1 > 0) {
-    alpha[0] = da_r;
-    alpha[1] = da_i;
-    zscal_kernel_inc_8(n1, alpha, x, inc_x);
+    zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
     j = n1;
     i = n1 * inc_x;
   }
@@ -358,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
     else if (da_i == 0)
      zscal_kernel_8_zero_i(n1, alpha, x);
     else
-      zscal_kernel_8(n1, alpha, x);
+      zscal_kernel_8(n1, da_r, da_i, x);
 
     i = n1 << 1;
     j = n1;
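
Note: both scal kernels disable floating-point contraction before any code is compiled because a fused multiply-add evaluates its intermediate product without rounding, which breaks identities that complex arithmetic relies on. A small self-contained illustration of the hazard (assumes IEEE-754 doubles and C99 fma() from <math.h>; this program is an explanatory sketch, not part of the patch):

#include <math.h>
#include <stdio.h>

int main(void) {
  /* The imaginary part of z * conj(z) is a*b - b*a and must be exactly 0. */
  double a = 1.0 / 3.0, b = 3.0 / 7.0;

  double plain = a * b - b * a;       /* evaluates to 0.0 */
  double fused = fma(a, b, -(b * a)); /* what -ffp-contract=fast may emit */

  /* fused holds the rounding residue of b*a, a tiny nonzero value. */
  printf("plain = %g, fused = %g\n", plain, fused);
  return 0;
}

The Makefile.zarch change is the complementary half: clang's -ffp-contract=on only contracts within a single expression, as the C standard permits, instead of the more aggressive =fast, while the per-file pragmas switch contraction off entirely for the two complex kernels.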
diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f
index c3b2238bf..f0ff3a20d 100644
--- a/lapack-netlib/SRC/cgelq.f
+++ b/lapack-netlib/SRC/cgelq.f
@@ -26,7 +26,7 @@ *> where:
 *>
 *>    Q is a N-by-N orthogonal matrix;
-*>    L is an lower-triangular M-by-M matrix;
+*>    L is a lower-triangular M-by-M matrix;
 *>    0 is a M-by-(N-M) zero matrix, if M < N.
 *>
 *> \endverbatim
@@ -187,7 +187,7 @@ *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -243,20 +243,32 @@ *
 *     Determine if the workspace size satisfies minimal size
 *
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWMIN = MAX( 1, N )
+         LWOPT = MAX( 1, MB*N )
+      ELSE
+         LWMIN = MAX( 1, M )
+         LWOPT = MAX( 1, MB*M )
+      END IF
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M )
-     $    .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ )
+      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
+     $    .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
           LMINWS = .TRUE.
           MB = 1
           NB = N
         END IF
-        IF( LWORK.LT.MB*M ) THEN
+        IF( LWORK.LT.LWOPT ) THEN
           LMINWS = .TRUE.
           MB = 1
         END IF
       END IF
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWREQ = MAX( 1, MB*N )
+      ELSE
+         LWREQ = MAX( 1, MB*M )
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -281,9 +293,9 @@ T( 2 ) = MB
       T( 3 ) = NB
       IF( MINW ) THEN
-        WORK( 1 ) = MAX( 1, N )
+        WORK( 1 ) = LWMIN
       ELSE
-        WORK( 1 ) = MAX( 1, MB*M )
+        WORK( 1 ) = LWREQ
       END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@ $                 LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, MB*M )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f
index 3d783be66..01de3c984 100644
--- a/lapack-netlib/SRC/cgetsls.f
+++ b/lapack-netlib/SRC/cgetsls.f
@@ -261,7 +261,7 @@ TSZM = INT( TQ( 1 ) )
       LWM = INT( WORKQ( 1 ) )
       CALL CGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
-     $             TSZO, B, LDB, WORKQ, -1, INFO2 )
+     $             TSZM, B, LDB, WORKQ, -1, INFO2 )
       LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
       WSIZEO = TSZO + LWO
       WSIZEM = TSZM + LWM
diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f
index fc14d892f..7b2f80862 100644
--- a/lapack-netlib/SRC/dgelq.f
+++ b/lapack-netlib/SRC/dgelq.f
@@ -26,7 +26,7 @@ *> where:
 *>
 *>    Q is a N-by-N orthogonal matrix;
-*>    L is an lower-triangular M-by-M matrix;
+*>    L is a lower-triangular M-by-M matrix;
 *>    0 is a M-by-(N-M) zero matrix, if M < N.
 *>
 *> \endverbatim
@@ -187,7 +187,7 @@ *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -243,20 +243,32 @@ *
 *     Determine if the workspace size satisfies minimal size
 *
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWMIN = MAX( 1, N )
+         LWOPT = MAX( 1, MB*N )
+      ELSE
+         LWMIN = MAX( 1, M )
+         LWOPT = MAX( 1, MB*M )
+      END IF
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M )
-     $    .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ )
+      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
+     $    .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
           LMINWS = .TRUE.
           MB = 1
           NB = N
         END IF
-        IF( LWORK.LT.MB*M ) THEN
+        IF( LWORK.LT.LWOPT ) THEN
           LMINWS = .TRUE.
           MB = 1
         END IF
       END IF
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWREQ = MAX( 1, MB*N )
+      ELSE
+         LWREQ = MAX( 1, MB*M )
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -281,9 +293,9 @@ T( 2 ) = MB
       T( 3 ) = NB
       IF( MINW ) THEN
-        WORK( 1 ) = MAX( 1, N )
+        WORK( 1 ) = LWMIN
       ELSE
-        WORK( 1 ) = MAX( 1, MB*M )
+        WORK( 1 ) = LWREQ
       END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@ $                 LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, MB*M )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f
index dfc72c8b2..c2ba5e2b8 100644
--- a/lapack-netlib/SRC/dgetsls.f
+++ b/lapack-netlib/SRC/dgetsls.f
@@ -258,7 +258,7 @@ TSZM = INT( TQ( 1 ) )
       LWM = INT( WORKQ( 1 ) )
       CALL DGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
-     $             TSZO, B, LDB, WORKQ, -1, INFO2 )
+     $             TSZM, B, LDB, WORKQ, -1, INFO2 )
       LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
       WSIZEO = TSZO + LWO
       WSIZEM = TSZM + LWM
diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f
index 96c4097e8..e45c68db4 100644
--- a/lapack-netlib/SRC/sgelq.f
+++ b/lapack-netlib/SRC/sgelq.f
@@ -26,7 +26,7 @@ *> where:
 *>
 *>    Q is a N-by-N orthogonal matrix;
-*>    L is an lower-triangular M-by-M matrix;
+*>    L is a lower-triangular M-by-M matrix;
 *>    0 is a M-by-(N-M) zero matrix, if M < N.
 *>
 *> \endverbatim
@@ -187,7 +187,7 @@ *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -243,20 +243,32 @@ *
 *     Determine if the workspace size satisfies minimal size
 *
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWMIN = MAX( 1, N )
+         LWOPT = MAX( 1, MB*N )
+      ELSE
+         LWMIN = MAX( 1, M )
+         LWOPT = MAX( 1, MB*M )
+      END IF
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M )
-     $    .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ )
+      IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
+     $    .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
           LMINWS = .TRUE.
           MB = 1
           NB = N
         END IF
-        IF( LWORK.LT.MB*M ) THEN
+        IF( LWORK.LT.LWOPT ) THEN
           LMINWS = .TRUE.
           MB = 1
         END IF
       END IF
+      IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
+         LWREQ = MAX( 1, MB*N )
+      ELSE
+         LWREQ = MAX( 1, MB*M )
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index 53d2f9431..3bf084515 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -258,7 +258,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL SGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index 4e7e7e38e..beb054b87 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 1aab3c662..11233785b 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -261,7 +261,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL ZGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM