Merge pull request #88 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-09-22 23:15:33 +02:00 committed by GitHub
commit 61fae59298
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 228 additions and 205 deletions

View File

@@ -204,6 +204,17 @@ matrix:
env: env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
- <<: *test-macos
osx_image: xcode12
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc@10 # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
- <<: *test-macos - <<: *test-macos
osx_image: xcode10.0 osx_image: xcode10.0
env: env:

View File

@@ -12,5 +12,5 @@ endif
# Enable floating-point expression contraction for clang, since it is the # Enable floating-point expression contraction for clang, since it is the
# default for gcc # default for gcc
ifeq ($(C_COMPILER), CLANG) ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -ffp-contract=fast CCOMMON_OPT += -ffp-contract=on
endif endif

View File

@@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" /*
* Avoid contraction of floating point operations, specifically fused
* multiply-add, because they can cause unexpected results in complex
* multiplication.
*/
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC optimize ("fp-contract=off")
#endif
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { #if defined(__clang__)
__asm__("vlrepf %%v0,0(%[alpha])\n\t" #pragma clang fp contract(off)
"vlef %%v1,4(%[alpha]),0\n\t" #endif
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t" #include "common.h"
"vlef %%v1,4(%[alpha]),1\n\t" #include "vector-common.h"
"vlef %%v1,4(%[alpha]),3\n\t"
"srlg %[n],%[n],4\n\t" static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
"xgr %%r1,%%r1\n\t" vector_float da_r_vec = vec_splats(da_r);
"0:\n\t" vector_float da_i_vec = { -da_i, da_i, -da_i, da_i };
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t" vector_float *x_vec_ptr = (vector_float *)x;
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t" #pragma GCC unroll 16
"vl %%v19,48(%%r1,%[x])\n\t" for (size_t i = 0; i < n/2; i++) {
"vl %%v20,64(%%r1,%[x])\n\t" vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
"vl %%v21,80(%%r1,%[x])\n\t" vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]};
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t" x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
"verllg %%v24,%%v16,32\n\t" }
"verllg %%v25,%%v17,32\n\t"
"verllg %%v26,%%v18,32\n\t"
"verllg %%v27,%%v19,32\n\t"
"verllg %%v28,%%v20,32\n\t"
"verllg %%v29,%%v21,32\n\t"
"verllg %%v30,%%v22,32\n\t"
"verllg %%v31,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0\n\t"
"vfmsb %%v20,%%v20,%%v0\n\t"
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
"vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
@@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
: "cc", "r1", "v0"); : "cc", "r1", "v0");
} }
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
BLASLONG inc_x) { BLASLONG inc_x) {
BLASLONG i; BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x; BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3; FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) { for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1]; t0 = da_r * x[0] - da_i * x[1];
@@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1 > 0) { if (n1 > 0) {
alpha[0] = da_r; cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
alpha[1] = da_i;
cscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1; j = n1;
i = n1 * inc_x; i = n1 * inc_x;
} }
@@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
else if (da_i == 0) else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x); cscal_kernel_16_zero_i(n1, alpha, x);
else else
cscal_kernel_16(n1, alpha, x); cscal_kernel_16(n1, da_r, da_i, x);
i = n1 << 1; i = n1 << 1;
j = n1; j = n1;

View File

@@ -30,12 +30,13 @@
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "common.h" #include "common.h"
#include <vecintrin.h> #include "vector-common.h"
#include <stdbool.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#ifdef COMPLEX #ifdef COMPLEX
#error "Handling for complex numbers is not supported in this kernel" #error "Handling for complex numbers is not supported in this kernel"
#endif #endif
@@ -153,37 +154,6 @@ static const bool backwards = false;
* 3, May 2008. * 3, May 2008.
*/ */
#define VLEN_BYTES 16
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
typedef FLOAT vector_float __attribute__ ((vector_size (16)));
/**
* Load a vector into register, and hint on 8-byte alignment to improve
* performance. gcc-9 and newer will create these hints by itself. For older
* compiler versions, use inline assembly to explicitly express the hint.
* Provide explicit hex encoding to cater for binutils versions that do not know
* about vector-load with alignment hints yet.
*
* Note that, for block sizes where we apply vectorization, vectors in A will
* always be 8-byte aligned.
*/
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
vector_float const *restrict addr = (vector_float const *restrict)a;
vector_float y;
#if __GNUC__ < 9 && !defined(__clang__)
// hex-encode vl %[out],%[addr],3
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
: [ out ] "=v"(y)
: [ addr ] "R"(*addr));
#else
y = *addr;
#endif
return y;
}
/** /**
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
* *

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) IBM Corporation 2020.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <vecintrin.h>
#define VLEN_BYTES 16
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES)));
/**
* Load a vector into register, and hint on 8-byte alignment to improve
* performance. gcc-9 and newer will create these hints by itself. For older
* compiler versions, use inline assembly to explicitly express the hint.
* Provide explicit hex encoding to cater for binutils versions that do not know
* about vector-load with alignment hints yet.
*
* Note that, for block sizes where we apply vectorization, vectors in A will
* always be 8-byte aligned.
*/
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
vector_float const *restrict addr = (vector_float const *restrict)a;
vector_float y;
#if __GNUC__ < 9 && !defined(__clang__)
// hex-encode vl %[out],%[addr],3
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
: [ out ] "=v"(y)
: [ addr ] "R"(*addr));
#else
y = *addr;
#endif
return y;
}

View File

@@ -25,65 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" /*
* Avoid contraction of floating point operations, specifically fused
* multiply-add, because they can cause unexpected results in complex
* multiplication.
*/
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC optimize ("fp-contract=off")
#endif
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { #if defined(__clang__)
__asm__("vlrepg %%v0,0(%[alpha])\n\t" #pragma clang fp contract(off)
"vleg %%v1,8(%[alpha]),0\n\t" #endif
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t" #include "common.h"
"srlg %[n],%[n],3\n\t" #include "vector-common.h"
"xgr %%r1,%%r1\n\t"
"0:\n\t" static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
"pfd 2, 1024(%%r1,%[x])\n\t" vector_float da_r_vec = vec_splats(da_r);
"vl %%v16,0(%%r1,%[x])\n\t" vector_float da_i_vec = { -da_i, da_i };
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t" vector_float * x_vec_ptr = (vector_float *)x;
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t" #pragma GCC unroll 16
"vl %%v21,80(%%r1,%[x])\n\t" for (size_t i = 0; i < n; i++) {
"vl %%v22,96(%%r1,%[x])\n\t" vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
"vl %%v23,112(%%r1,%[x])\n\t" vector_float x_swapped = {x_vec[1], x_vec[0]};
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t" x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
"vpdi %%v26,%%v18,%%v18,4\n\t" }
"vpdi %%v27,%%v19,%%v19,4\n\t"
"vpdi %%v28,%%v20,%%v20,4\n\t"
"vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
@@ -195,14 +165,12 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
: "cc", "r1", "v0"); : "cc", "r1", "v0");
} }
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
BLASLONG inc_x) { BLASLONG inc_x) {
BLASLONG i; BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x; BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3; FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) { for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1]; t0 = da_r * x[0] - da_i * x[1];
@@ -320,9 +288,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1 > 0) { if (n1 > 0) {
alpha[0] = da_r; zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1; j = n1;
i = n1 * inc_x; i = n1 * inc_x;
} }
@@ -358,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
else if (da_i == 0) else if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x); zscal_kernel_8_zero_i(n1, alpha, x);
else else
zscal_kernel_8(n1, alpha, x); zscal_kernel_8(n1, da_r, da_i, x);
i = n1 << 1; i = n1 << 1;
j = n1; j = n1;

View File

@@ -26,7 +26,7 @@
*> where: *> where:
*> *>
*> Q is a N-by-N orthogonal matrix; *> Q is a N-by-N orthogonal matrix;
*> L is an lower-triangular M-by-M matrix; *> L is a lower-triangular M-by-M matrix;
*> 0 is a M-by-(N-M) zero matrix, if M < N. *> 0 is a M-by-(N-M) zero matrix, if M < N.
*> *>
*> \endverbatim *> \endverbatim
@@ -187,7 +187,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
LOGICAL LQUERY, LMINWS, MINT, MINW LOGICAL LQUERY, LMINWS, MINT, MINW
INTEGER MB, NB, MINTSZ, NBLCKS INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
* .. * ..
* .. External Functions .. * .. External Functions ..
LOGICAL LSAME LOGICAL LSAME
@@ -243,20 +243,32 @@
* *
* Determine if the workspace size satisfies minimal size * Determine if the workspace size satisfies minimal size
* *
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWMIN = MAX( 1, N )
LWOPT = MAX( 1, MB*N )
ELSE
LWMIN = MAX( 1, M )
LWOPT = MAX( 1, MB*M )
END IF
LMINWS = .FALSE. LMINWS = .FALSE.
IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
$ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
$ .AND. ( .NOT.LQUERY ) ) THEN $ .AND. ( .NOT.LQUERY ) ) THEN
IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
NB = N NB = N
END IF END IF
IF( LWORK.LT.MB*M ) THEN IF( LWORK.LT.LWOPT ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
END IF END IF
END IF END IF
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWREQ = MAX( 1, MB*N )
ELSE
LWREQ = MAX( 1, MB*M )
END IF
* *
IF( M.LT.0 ) THEN IF( M.LT.0 ) THEN
INFO = -1 INFO = -1
@@ -267,7 +279,7 @@
ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
$ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
INFO = -6 INFO = -6
ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
$ .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LMINWS ) ) THEN
INFO = -8 INFO = -8
END IF END IF
@@ -281,9 +293,9 @@
T( 2 ) = MB T( 2 ) = MB
T( 3 ) = NB T( 3 ) = NB
IF( MINW ) THEN IF( MINW ) THEN
WORK( 1 ) = MAX( 1, N ) WORK( 1 ) = LWMIN
ELSE ELSE
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
END IF END IF
END IF END IF
IF( INFO.NE.0 ) THEN IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@
$ LWORK, INFO ) $ LWORK, INFO )
END IF END IF
* *
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
* *
RETURN RETURN
* *

View File

@@ -261,7 +261,7 @@
TSZM = INT( TQ( 1 ) ) TSZM = INT( TQ( 1 ) )
LWM = INT( WORKQ( 1 ) ) LWM = INT( WORKQ( 1 ) )
CALL CGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, CALL CGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
$ TSZO, B, LDB, WORKQ, -1, INFO2 ) $ TSZM, B, LDB, WORKQ, -1, INFO2 )
LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
WSIZEO = TSZO + LWO WSIZEO = TSZO + LWO
WSIZEM = TSZM + LWM WSIZEM = TSZM + LWM

View File

@@ -26,7 +26,7 @@
*> where: *> where:
*> *>
*> Q is a N-by-N orthogonal matrix; *> Q is a N-by-N orthogonal matrix;
*> L is an lower-triangular M-by-M matrix; *> L is a lower-triangular M-by-M matrix;
*> 0 is a M-by-(N-M) zero matrix, if M < N. *> 0 is a M-by-(N-M) zero matrix, if M < N.
*> *>
*> \endverbatim *> \endverbatim
@@ -187,7 +187,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
LOGICAL LQUERY, LMINWS, MINT, MINW LOGICAL LQUERY, LMINWS, MINT, MINW
INTEGER MB, NB, MINTSZ, NBLCKS INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
* .. * ..
* .. External Functions .. * .. External Functions ..
LOGICAL LSAME LOGICAL LSAME
@@ -243,20 +243,32 @@
* *
* Determine if the workspace size satisfies minimal size * Determine if the workspace size satisfies minimal size
* *
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWMIN = MAX( 1, N )
LWOPT = MAX( 1, MB*N )
ELSE
LWMIN = MAX( 1, M )
LWOPT = MAX( 1, MB*M )
END IF
LMINWS = .FALSE. LMINWS = .FALSE.
IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
$ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
$ .AND. ( .NOT.LQUERY ) ) THEN $ .AND. ( .NOT.LQUERY ) ) THEN
IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
NB = N NB = N
END IF END IF
IF( LWORK.LT.MB*M ) THEN IF( LWORK.LT.LWOPT ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
END IF END IF
END IF END IF
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWREQ = MAX( 1, MB*N )
ELSE
LWREQ = MAX( 1, MB*M )
END IF
* *
IF( M.LT.0 ) THEN IF( M.LT.0 ) THEN
INFO = -1 INFO = -1
@@ -267,7 +279,7 @@
ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
$ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
INFO = -6 INFO = -6
ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
$ .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LMINWS ) ) THEN
INFO = -8 INFO = -8
END IF END IF
@@ -281,9 +293,9 @@
T( 2 ) = MB T( 2 ) = MB
T( 3 ) = NB T( 3 ) = NB
IF( MINW ) THEN IF( MINW ) THEN
WORK( 1 ) = MAX( 1, N ) WORK( 1 ) = LWMIN
ELSE ELSE
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
END IF END IF
END IF END IF
IF( INFO.NE.0 ) THEN IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@
$ LWORK, INFO ) $ LWORK, INFO )
END IF END IF
* *
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
* *
RETURN RETURN
* *

View File

@@ -258,7 +258,7 @@
TSZM = INT( TQ( 1 ) ) TSZM = INT( TQ( 1 ) )
LWM = INT( WORKQ( 1 ) ) LWM = INT( WORKQ( 1 ) )
CALL DGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, CALL DGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
$ TSZO, B, LDB, WORKQ, -1, INFO2 ) $ TSZM, B, LDB, WORKQ, -1, INFO2 )
LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
WSIZEO = TSZO + LWO WSIZEO = TSZO + LWO
WSIZEM = TSZM + LWM WSIZEM = TSZM + LWM

View File

@@ -26,7 +26,7 @@
*> where: *> where:
*> *>
*> Q is a N-by-N orthogonal matrix; *> Q is a N-by-N orthogonal matrix;
*> L is an lower-triangular M-by-M matrix; *> L is a lower-triangular M-by-M matrix;
*> 0 is a M-by-(N-M) zero matrix, if M < N. *> 0 is a M-by-(N-M) zero matrix, if M < N.
*> *>
*> \endverbatim *> \endverbatim
@@ -187,7 +187,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
LOGICAL LQUERY, LMINWS, MINT, MINW LOGICAL LQUERY, LMINWS, MINT, MINW
INTEGER MB, NB, MINTSZ, NBLCKS INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
* .. * ..
* .. External Functions .. * .. External Functions ..
LOGICAL LSAME LOGICAL LSAME
@@ -243,20 +243,32 @@
* *
* Determine if the workspace size satisfies minimal size * Determine if the workspace size satisfies minimal size
* *
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWMIN = MAX( 1, N )
LWOPT = MAX( 1, MB*N )
ELSE
LWMIN = MAX( 1, M )
LWOPT = MAX( 1, MB*M )
END IF
LMINWS = .FALSE. LMINWS = .FALSE.
IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
$ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
$ .AND. ( .NOT.LQUERY ) ) THEN $ .AND. ( .NOT.LQUERY ) ) THEN
IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
NB = N NB = N
END IF END IF
IF( LWORK.LT.MB*M ) THEN IF( LWORK.LT.LWOPT ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
END IF END IF
END IF END IF
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWREQ = MAX( 1, MB*N )
ELSE
LWREQ = MAX( 1, MB*M )
END IF
* *
IF( M.LT.0 ) THEN IF( M.LT.0 ) THEN
INFO = -1 INFO = -1
@@ -267,7 +279,7 @@
ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
$ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
INFO = -6 INFO = -6
ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
$ .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LMINWS ) ) THEN
INFO = -8 INFO = -8
END IF END IF
@@ -281,9 +293,9 @@
T( 2 ) = MB T( 2 ) = MB
T( 3 ) = NB T( 3 ) = NB
IF( MINW ) THEN IF( MINW ) THEN
WORK( 1 ) = MAX( 1, N ) WORK( 1 ) = LWMIN
ELSE ELSE
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
END IF END IF
END IF END IF
IF( INFO.NE.0 ) THEN IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@
$ LWORK, INFO ) $ LWORK, INFO )
END IF END IF
* *
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
RETURN RETURN
* *
* End of SGELQ * End of SGELQ

View File

@@ -258,7 +258,7 @@
TSZM = INT( TQ( 1 ) ) TSZM = INT( TQ( 1 ) )
LWM = INT( WORKQ( 1 ) ) LWM = INT( WORKQ( 1 ) )
CALL SGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, CALL SGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
$ TSZO, B, LDB, WORKQ, -1, INFO2 ) $ TSZM, B, LDB, WORKQ, -1, INFO2 )
LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
WSIZEO = TSZO + LWO WSIZEO = TSZO + LWO
WSIZEM = TSZM + LWM WSIZEM = TSZM + LWM

View File

@@ -26,7 +26,7 @@
*> where: *> where:
*> *>
*> Q is a N-by-N orthogonal matrix; *> Q is a N-by-N orthogonal matrix;
*> L is an lower-triangular M-by-M matrix; *> L is a lower-triangular M-by-M matrix;
*> 0 is a M-by-(N-M) zero matrix, if M < N. *> 0 is a M-by-(N-M) zero matrix, if M < N.
*> *>
*> \endverbatim *> \endverbatim
@@ -187,7 +187,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
LOGICAL LQUERY, LMINWS, MINT, MINW LOGICAL LQUERY, LMINWS, MINT, MINW
INTEGER MB, NB, MINTSZ, NBLCKS INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ
* .. * ..
* .. External Functions .. * .. External Functions ..
LOGICAL LSAME LOGICAL LSAME
@@ -243,20 +243,32 @@
* *
* Determine if the workspace size satisfies minimal size * Determine if the workspace size satisfies minimal size
* *
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWMIN = MAX( 1, N )
LWOPT = MAX( 1, MB*N )
ELSE
LWMIN = MAX( 1, M )
LWOPT = MAX( 1, MB*M )
END IF
LMINWS = .FALSE. LMINWS = .FALSE.
IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT )
$ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ )
$ .AND. ( .NOT.LQUERY ) ) THEN $ .AND. ( .NOT.LQUERY ) ) THEN
IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
NB = N NB = N
END IF END IF
IF( LWORK.LT.MB*M ) THEN IF( LWORK.LT.LWOPT ) THEN
LMINWS = .TRUE. LMINWS = .TRUE.
MB = 1 MB = 1
END IF END IF
END IF END IF
IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN
LWREQ = MAX( 1, MB*N )
ELSE
LWREQ = MAX( 1, MB*M )
END IF
* *
IF( M.LT.0 ) THEN IF( M.LT.0 ) THEN
INFO = -1 INFO = -1
@@ -267,7 +279,7 @@
ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 )
$ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
INFO = -6 INFO = -6
ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY )
$ .AND. ( .NOT.LMINWS ) ) THEN $ .AND. ( .NOT.LMINWS ) ) THEN
INFO = -8 INFO = -8
END IF END IF
@@ -281,9 +293,9 @@
T( 2 ) = MB T( 2 ) = MB
T( 3 ) = NB T( 3 ) = NB
IF( MINW ) THEN IF( MINW ) THEN
WORK( 1 ) = MAX( 1, N ) WORK( 1 ) = LWMIN
ELSE ELSE
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
END IF END IF
END IF END IF
IF( INFO.NE.0 ) THEN IF( INFO.NE.0 ) THEN
@@ -308,7 +320,7 @@
$ LWORK, INFO ) $ LWORK, INFO )
END IF END IF
* *
WORK( 1 ) = MAX( 1, MB*M ) WORK( 1 ) = LWREQ
* *
RETURN RETURN
* *

View File

@@ -261,7 +261,7 @@
TSZM = INT( TQ( 1 ) ) TSZM = INT( TQ( 1 ) )
LWM = INT( WORKQ( 1 ) ) LWM = INT( WORKQ( 1 ) )
CALL ZGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, CALL ZGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ,
$ TSZO, B, LDB, WORKQ, -1, INFO2 ) $ TSZM, B, LDB, WORKQ, -1, INFO2 )
LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) )
WSIZEO = TSZO + LWO WSIZEO = TSZO + LWO
WSIZEM = TSZM + LWM WSIZEM = TSZM + LWM