diff --git a/Makefile.arm64 b/Makefile.arm64 index 1091edfe5..62a877fff 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -66,6 +66,11 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +ifeq ($(CORE), VORTEX) +CCOMMON_OPT += -march=armv8.3-a +FCOMMON_OPT += -march=armv8.3-a +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/TargetList.txt b/TargetList.txt index 5934f3012..66eca4506 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -98,6 +98,7 @@ THUNDERX THUNDERX2T99 TSV110 THUNDERX3T110 +VORTEX 9.System Z: ZARCH_GENERIC diff --git a/benchmark/asum.c b/benchmark/asum.c index 78ccdf47b..e3d16acfd 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -128,8 +128,13 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; double time1,timeg; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; +#endif argc--;argv++; @@ -160,26 +165,30 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) timeg /= loops; #ifdef COMPLEX diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1fd43148a..a0d3e15b9 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -26,6 +26,11 @@ *****************************************************************************/ #include +#ifdef OS_DARWIN +#include +int32_t value; +size_t length=sizeof(value); +#endif #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 @@ -45,6 +50,8 @@ #define CPU_TSV110 9 // Ampere #define CPU_EMAG8180 10 +// Apple +#define CPU_VORTEX 13 static char *cpuname[] = { "UNKNOWN", @@ -59,7 +66,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", - "THUNDERX3T110" + "THUNDERX3T110", + "VORTEX" }; static char *cpuname_lower[] = { @@ -75,7 +83,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", - "thunderx3t110" + "thunderx3t110", + "vortex" }; int get_feature(char *search) @@ -198,6 +207,10 @@ int detect(void) } #else +#ifdef DARWIN + sysctlbyname("hw.cpufamily",&value,&length,NULL,0); + if (value ==131287967) return CPU_VORTEX; +#endif return CPU_ARMV8; #endif @@ -247,7 +260,10 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif - +#ifdef DARWIN + sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); + printf("#define NUM_CORES %d\n",value); +#endif } @@ -398,6 +414,19 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; +#ifdef DARWIN + case CPU_VORTEX: + printf("#define VORTEX \n"); + sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); + printf("#define L1_CODE_SIZE %d \n",value); + sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); + printf("#define L1_CODE_LINESIZE %d \n",value); + sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); + printf("#define L1_DATA_SIZE %d \n",value); + sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); + printf("#define L2_DATA_SIZE %d \n",value); + break; +#endif } get_cpucount(); } diff --git a/kernel/arm64/KERNEL.VORTEX b/kernel/arm64/KERNEL.VORTEX new file mode 100644 index 000000000..e3efef1f5 --- /dev/null +++ b/kernel/arm64/KERNEL.VORTEX @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8 diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ef8b36a57..b979fc0ae 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c new file mode 100644 index 000000000..8a40ea4b9 --- /dev/null +++ b/kernel/x86_64/dasum.c @@ -0,0 +1,82 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "dasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "dasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_DASUM_KERNEL +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = dasum_kernel(n, x); + } + else { + n *= inc_x; + + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + } + return(sumf); +} + diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c new file mode 100644 index 000000000..4fc73ddd4 --- /dev/null +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -0,0 +1,86 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) + +#define HAVE_DASUM_KERNEL + +#include +#include + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); + for (i = 0; i < tail_index_AVX2; i += 16) { + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + sumf += half_accum0[0]; + } + + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = _mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); + + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } + + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = _mm_hadd_pd(accum_20, accum_20); + + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + + return sumf; + +} +#endif diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c new file mode 100644 index 000000000..aea8c02d9 --- /dev/null +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -0,0 +1,80 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DASUM_KERNEL 1 + +#include + +#include + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); + + // + if ( n >= 256 ) { + + __m512d accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + for (i = 0; i < tail_index_AVX512; i += 32) { + accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0])); + accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8])); + accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16])); + accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24])); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_pd(accum_0); + } + + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = _mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); + + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } + + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = _mm_hadd_pd(accum_20, accum_20); + + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c new file mode 100644 index 000000000..36ec4a737 --- /dev/null +++ b/kernel/x86_64/sasum.c @@ -0,0 +1,90 @@ +#include "common.h" + +#if defined(DOUBLE) +#error supports float only +#else +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#endif + +#if defined(SKYLAKEX) +#include "sasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "sasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_SASUM_KERNEL + +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = sasum_kernel(n, x); + } + else { + + n *= inc_x; + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + + } + return(sumf); +} diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c new file mode 100644 index 000000000..8e6cb9a47 --- /dev/null +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -0,0 +1,82 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) + +#define HAVE_SASUM_KERNEL 1 + +#include +#include + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + + __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); + for (i = 0; i < tail_index_AVX2; i += 32) { + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + __m128 half_accum0; + half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + sumf += half_accum0[0]; + + } + + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c new file mode 100644 index 000000000..c8c69d1e0 --- /dev/null +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -0,0 +1,73 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SASUM_KERNEL 1 + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#include +#include + +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); + + if (n >= 256) { + __m512 accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + for (i = 0; i < tail_index_AVX512; i += 64) { + accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0])); + accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16])); + accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32])); + accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48])); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_ps(accum_0); + } + + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; i++) { + sumf += ABS_K(x1[i]); + } + + return sumf; +} +#endif diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index cd884febf..100f133ab 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH