diff --git a/Makefile.system b/Makefile.system index 848c38797..91a078565 100644 --- a/Makefile.system +++ b/Makefile.system @@ -904,8 +904,8 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) -ifeq ($(FLANG_VENDOR),AOCC) +FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +ifeq ($(FLANG_VENDOR), AMD) FCOMMON_OPT += -fno-unroll-loops endif endif diff --git a/benchmark/bench.h b/benchmark/bench.h index 1f9b8986c..83de8ab2b 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){ #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; +#elif defined(__APPLE__) + mach_timebase_info_data_t info; + uint64_t start = 0, stop = 0; #else struct timespec start = { 0, 0 }, stop = { 0, 0 }; #endif @@ -82,6 +85,9 @@ double getsec() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#elif defined(__APPLE__) + mach_timebase_info(&info); + return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; #else return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; #endif @@ -90,6 +96,8 @@ double getsec() void begin() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &start, (struct timezone *)0); +#elif defined(__APPLE__) + start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &start); #endif @@ -98,7 +106,9 @@ void begin() { void end() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &stop, (struct timezone *)0); +#elif defined(__APPLE__) + stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &stop); #endif -} \ No newline at end of file +} diff --git a/f_check b/f_check index d044f2547..fe947bf66 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 999dc677a..0cdec3292 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" +#elif defined(POWER10) +#include "dasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -16; + if ( n1 > 0 ) + { + sumf += dasum_kernel_16(n1, &x[i]); + i+=n1; + } +#else n1 = n & -16; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = dasum_kernel_16(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c new file mode 100644 index 000000000..d1a21b4d1 --- /dev/null +++ b/kernel/power/dasum_microk_power10.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static double dasum_kernel_16 (long n, double *x) +{ + double sum; + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "addic. %1, %1, -16 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + "xsadddp %x0, 32, 33 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=d" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} + + diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b531799a6..e918e61c3 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ @@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); @@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); @@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 733137012..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" +#elif defined(POWER10) +#include "sasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += sasum_kernel_32(n1, &x[i]); + i+=n1; + } +#else n1 = n & -32; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = sasum_kernel_32(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c new file mode 100644 index 000000000..ea12a4264 --- /dev/null +++ b/kernel/power/sasum_microk_power10.c @@ -0,0 +1,153 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_32 1 + +static float sasum_kernel_32 (long n, float *x) +{ + float sum; + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "addic. %1, %1, -32 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=f" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 31b3682b9..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #elif defined(POWER10) #if defined(DOUBLE) -#include "zscal_microk_power8.c" +#include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c new file mode 100644 index 000000000..15b8323f4 --- /dev/null +++ b/kernel/power/zscal_microk_power10.c @@ -0,0 +1,195 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xsnegdp 33, %x10 \n\t" // -alpha_i + XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r + XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. %1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + + + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + + + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + + + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%9,%10) \n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5) // 8 + : + "d" (alpha_r), // 9 + "d" (alpha_i) // 10 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index 8e245cc8f..b5545726e 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -1,5 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_SROT_KERNEL 1 diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e877b1422..10c25a446 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -25,7 +25,7 @@ set(AEIGTST set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f ssvdch.f ssvdct.f ssxt1.f) -set(SEIGTST schkee.f +set(SEIGTST schkee.F sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f schkbb.f schkbd.f schkbk.f schkbl.f schkec.f schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f @@ -42,7 +42,7 @@ set(SEIGTST schkee.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyt21.f ssyt22.f) -set(CEIGTST cchkee.f +set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) -set(DEIGTST dchkee.f +set(DEIGTST dchkee.F dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f @@ -79,7 +79,7 @@ set(DEIGTST dchkee.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyt21.f dsyt22.f) -set(ZEIGTST zchkee.f +set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index b3efebcd0..a292e4496 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -157,11 +157,11 @@ cleanobj: cleanexe: rm -f xeigtst* -schkee.o: schkee.f +schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkee.o: dchkee.f +dchkee.o: dchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkee.o: cchkee.f +cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkee.o: zchkee.f +zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cchkee.f b/lapack-netlib/TESTING/EIG/cchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/cchkee.f rename to lapack-netlib/TESTING/EIG/cchkee.F index f2a5f8d41..0d3d7493c 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.f +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM CCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1071,7 +1075,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1088,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN REAL SECOND, SLAMCH @@ -1130,6 +1138,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * A = 0.0 @@ -1846,8 +1869,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL CERRST( 'CST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2305,8 +2336,16 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL CERRST( 'CHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2436,7 +2475,14 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/dchkee.f b/lapack-netlib/TESTING/EIG/dchkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/dchkee.f rename to lapack-netlib/TESTING/EIG/dchkee.F index dc6f3205a..ee22ce33d 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.f +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1038,7 +1038,11 @@ *> \ingroup double_eig * * ===================================================================== - PROGRAM DCHKEE + PROGRAM DCHKEE +* +#if defined(_OPENMP) + use omp_lib +#endif * * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -1077,7 +1081,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1093,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1139,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1856,8 +1874,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL DERRST( 'DST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL DERRST( 'DST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2436,7 +2462,12 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/schkee.f b/lapack-netlib/TESTING/EIG/schkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/schkee.f rename to lapack-netlib/TESTING/EIG/schkee.F index 3757e0655..a063c18b5 100644 --- a/lapack-netlib/TESTING/EIG/schkee.f +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1040,6 +1040,10 @@ * ===================================================================== PROGRAM SCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1077,7 +1081,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1093,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: WORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1139,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1857,8 +1875,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL SERRST( 'SST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL SERRST( 'SST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2440,6 +2466,11 @@ WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/zchkee.f b/lapack-netlib/TESTING/EIG/zchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/zchkee.f rename to lapack-netlib/TESTING/EIG/zchkee.F index 6807ef7e4..29604956d 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.f +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM ZCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1071,7 +1075,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1088,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION DLAMCH, DSECND @@ -1130,6 +1138,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * A = 0.0 @@ -1846,8 +1869,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL ZERRST( 'ZST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2303,8 +2334,16 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL ZERRST( 'ZHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2435,6 +2474,13 @@ WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )