NVIDIA HPC SDK (nvc / nvfortran, handled as the PGI compiler family) build
support -- reviewed patch.

Text before the first "diff --git" line is commentary and is skipped by patch
tools.

What the patch changes, file by file:

  Makefile
      If the first line of `$(CC) --version` / `$(FC) --version` is empty,
      report the second line instead.
  Makefile.power
      Do not add the -mcpu=power10 flag set when C_COMPILER is PGI.
  Makefile.system
      Treat "nvc" like "pgcc" when deciding whether getarch gets
      -march=native; limit DYNAMIC_CORE to POWER8 and POWER9 when C_COMPILER
      is PGI on power; derive NEWPGI from the compiler version banner and add
      -D__MMX__ -Mnollvm only when NEWPGI is not 1; warn for POWER6 and
      POWER10 cores; append -Kieee after -Mrecursive.
  README.md
      Spell out "Basic Linear Algebra Subprograms".
  driver/others/dynamic_power.c
      Under C_PGI, provide a file-local __builtin_cpu_is() that reads the
      processor version register with mfpvr, looks it up in pvrPOWER[] and
      answers only "power8" and "power9" (everything else returns 0). The
      POWER6 branches are compiled out under C_PGI.
  f_check
      Add pgfortran and nvfortran to the compiler search list; classify
      compiler names matching /nvf/ and output matching /NVF/ as vendor PGI;
      test $CC instead of $ENV{"CC"} for the -lgomp / clang substitution.
  kernel/Makefile
      Add -mavx512f next to -march=skylake-avx512.
  kernel/arm64/KERNEL.NEOVERSEN1, KERNEL.THUNDERX2T99, KERNEL.THUNDERX3T110
      Switch the *NRM2 kernels to nrm2.S / znrm2.S.

Review corrections (in-place edits; every hunk keeps its line counts):

  1. Makefile.system: PGCVERSIONCHECK expanded $(PGCVERSIONEQ20), which is
     never defined; the variable defined two lines above is PGCVERSIONGTEQ20.
     The undefined name expands to nothing, so the check string was only two
     characters long, could never equal 110, 111 or 011, and NEWPGI was never
     set.
  2. driver/others/dynamic_power.c: the DEBUG printf passed the uint32_t
     cpu_type to a %p conversion; it now uses %u with an unsigned int cast.
  3. driver/others/dynamic_power.c: the header name of the added #include was
     missing; restored as <stdint.h>, which supplies the uint32_t used by the
     added code.
  4. kernel/arm64/KERNEL.THUNDERX2T99: CNRM2KERNEL and DNRM2KERNEL were
     swapped. They now agree with the NEOVERSEN1 and THUNDERX3T110 hunks
     (S/D -> nrm2.S, C/Z -> znrm2.S).
  5. README.md: restored the wiki URL on the context line
     (NOTE(review): confirm against the target README).

NOTE(review): leading whitespace was reconstructed, so apply with
whitespace-insensitive matching (`git apply --ignore-whitespace` or
`patch -l`). The "index" lines are kept from the original patch; the
post-image hashes of the three corrected files no longer correspond.

diff --git a/Makefile b/Makefile
index 54dd3be41..de0735c4a 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,9 @@ endif
 	@$(CC) --version > /dev/null 2>&1;\
 	if [ $$? -eq 0 ]; then \
 	   cverinfo=`$(CC) --version | sed -n '1p'`; \
+	   if [ -z "$${cverinfo}" ]; then \
+	   cverinfo=`$(CC) --version | sed -n '2p'`; \
+	   fi; \
 	   echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
 	else \
 	   echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
@@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	@$(FC) --version > /dev/null 2>&1;\
 	if [ $$? -eq 0 ]; then \
 	   fverinfo=`$(FC) --version | sed -n '1p'`; \
+	   if [ -z "$${fverinfo}" ]; then \
+	   fverinfo=`$(FC) --version | sed -n '2p'`; \
+	   fi; \
 	   echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
 	else \
 	   echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
diff --git a/Makefile.power b/Makefile.power
index c7e972290..946f55232 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -10,9 +10,11 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
+ifneq ($(C_COMPILER), PGI)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
 endif
+endif
 
 ifeq ($(CORE), POWER9)
 ifneq ($(C_COMPILER), PGI)
diff --git a/Makefile.system b/Makefile.system
index 5adde36d8..ce3a819a8 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -181,7 +181,7 @@ endif
 
 # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
 ifeq ($(HOSTARCH), x86_64)
-ifeq ($(findstring pgcc,$(HOSTCC)),)
+ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
 GETARCH_FLAGS += -march=native
 endif
 endif
@@ -663,6 +663,7 @@ endif
 endif # ARCH zarch
 
 ifeq ($(ARCH), power)
+ifneq ($(C_COMPILER), PGI)
 DYNAMIC_CORE = POWER6
 DYNAMIC_CORE += POWER8
 ifneq ($(C_COMPILER), GCC)
@@ -689,6 +690,10 @@ else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
 endif
 endif
+else
+DYNAMIC_CORE = POWER8
+DYNAMIC_CORE += POWER9
+endif
 endif
 
 # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -847,9 +852,19 @@ endif
 endif
 
 ifeq ($(C_COMPILER), PGI)
+PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
+PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
+PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
+PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
+ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
+NEWPGI := 1
+endif
 ifdef BINARY64
 ifeq ($(ARCH), x86_64)
-CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
+CCOMMON_OPT += -tp p7-64
+ifneq ($(NEWPGI),1)
+CCOMMON_OPT += -D__MMX__ -Mnollvm
+endif
 else
 ifeq ($(ARCH), power)
 ifeq ($(CORE), POWER8)
@@ -1029,18 +1044,24 @@ ifeq ($(ARCH), x86_64)
 FCOMMON_OPT += -tp p7-64
 else
 ifeq ($(ARCH), power)
+ifeq ($(CORE), POWER6)
+$(warning NVIDIA HPC compilers do not support POWER6.)
+endif
 ifeq ($(CORE), POWER8)
 FCOMMON_OPT += -tp pwr8
 endif
 ifeq ($(CORE), POWER9)
 FCOMMON_OPT += -tp pwr9
 endif
+ifeq ($(CORE), POWER10)
+$(warning NVIDIA HPC compilers do not support POWER10.)
+endif
 endif
 endif
 else
 FCOMMON_OPT += -tp p7
 endif
-FCOMMON_OPT += -Mrecursive
+FCOMMON_OPT += -Mrecursive -Kieee
 ifeq ($(USE_OPENMP), 1)
 FCOMMON_OPT += -mp
 endif
diff --git a/README.md b/README.md
index 267df5358..6c6322c32 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta
 
 ## Introduction
 
-OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
+OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version.
 
 Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
 
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index a2f56d839..f9feeb6e8 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -27,7 +27,9 @@ static char *corename[] = {
 #define NUM_CORETYPES 4
 
 char *gotoblas_corename(void) {
+#ifndef C_PGI
   if (gotoblas == &gotoblas_POWER6) return corename[1];
+#endif
   if (gotoblas == &gotoblas_POWER8) return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
   if (gotoblas == &gotoblas_POWER9) return corename[3];
@@ -38,10 +40,157 @@ char *gotoblas_corename(void) {
   return corename[0];
 }
 
+#ifdef C_PGI
+/*
+ * NV HPC compilers do not yet implement __builtin_cpu_is().
+ * Fake a version here for use in the CPU detection code below.
+ *
+ * Strategy here is to first check the CPU to see what it actually is,
+ * and then test the input to see if what the CPU actually is matches
+ * what was requested.
+ */
+
+#include <stdint.h>
+
+/*
+ * Define POWER processor version table.
+ *
+ * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
+ */
+
+#define CPU_UNKNOWN 0
+#define CPU_POWER5 5
+#define CPU_POWER6 6
+#define CPU_POWER8 8
+#define CPU_POWER9 9
+#define CPU_POWER10 10
+
+static struct {
+  uint32_t pvr_mask;
+  uint32_t pvr_value;
+  const char* cpu_name;
+  uint32_t cpu_type;
+} pvrPOWER [] = {
+
+  { /* POWER6 in P5+ mode; 2.04-compliant processor */
+    .pvr_mask = 0xffffffff,
+    .pvr_value = 0x0f000001,
+    .cpu_name = "POWER5+",
+    .cpu_type = CPU_POWER5,
+  },
+
+  { /* Power6 aka POWER6X*/
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x003e0000,
+    .cpu_name = "POWER6 (raw)",
+    .cpu_type = CPU_POWER6,
+  },
+
+  { /* Power7 */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x003f0000,
+    .cpu_name = "POWER7 (raw)",
+    .cpu_type = CPU_POWER6,
+  },
+
+  { /* Power7+ */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x004A0000,
+    .cpu_name = "POWER7+ (raw)",
+    .cpu_type = CPU_POWER6,
+  },
+
+  { /* Power8E */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x004b0000,
+    .cpu_name = "POWER8E (raw)",
+    .cpu_type = CPU_POWER8,
+  },
+
+  { /* Power8NVL */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x004c0000,
+    .cpu_name = "POWER8NVL (raw)",
+    .cpu_type = CPU_POWER8,
+  },
+
+  { /* Power8 */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x004d0000,
+    .cpu_name = "POWER8 (raw)",
+    .cpu_type = CPU_POWER8,
+  },
+
+  { /* Power9 DD2.0 */
+    .pvr_mask = 0xffffefff,
+    .pvr_value = 0x004e0200,
+    .cpu_name = "POWER9 (raw)",
+    .cpu_type = CPU_POWER9,
+  },
+
+  { /* Power9 DD 2.1 */
+    .pvr_mask = 0xffffefff,
+    .pvr_value = 0x004e0201,
+    .cpu_name = "POWER9 (raw)",
+    .cpu_type = CPU_POWER9,
+  },
+
+  { /* Power9 DD2.2 or later */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x004e0000,
+    .cpu_name = "POWER9 (raw)",
+    .cpu_type = CPU_POWER9,
+  },
+
+  { /* Power10 */
+    .pvr_mask = 0xffff0000,
+    .pvr_value = 0x00800000,
+    .cpu_name = "POWER10 (raw)",
+    .cpu_type = CPU_POWER10,
+  },
+
+  { /* End of table, pvr_mask and pvr_value must be zero */
+    .pvr_mask = 0x0,
+    .pvr_value = 0x0,
+    .cpu_name = "Unknown",
+    .cpu_type = CPU_UNKNOWN,
+  },
+};
+
+static int __builtin_cpu_is(const char *cpu) {
+  int i;
+  uint32_t pvr;
+  uint32_t cpu_type;
+
+  asm("mfpvr %0" : "=r"(pvr));
+
+  for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
+    if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
+      break;
+    }
+  }
+
+#if defined(DEBUG)
+  printf("%s: returning CPU=%s, cpu_type=%u\n", __func__,
+         pvrPOWER[i].cpu_name, (unsigned int)pvrPOWER[i].cpu_type);
+#endif
+  cpu_type = pvrPOWER[i].cpu_type;
+
+  if (!strcmp(cpu, "power8"))
+    return cpu_type == CPU_POWER8;
+  if (!strcmp(cpu, "power9"))
+    return cpu_type == CPU_POWER9;
+  return 0;
+}
+
+#endif /* C_PGI */
+
 static gotoblas_t *get_coretype(void) {
 
+#ifndef C_PGI
   if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
     return &gotoblas_POWER6;
+#endif
   if (__builtin_cpu_is("power8"))
     return &gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
@@ -77,7 +226,9 @@ static gotoblas_t *force_coretype(char * coretype) {
 
   switch (found)
   {
+#ifndef C_PGI
     case 1: return (&gotoblas_POWER6);
+#endif
     case 2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
     case 3: return (&gotoblas_POWER9);
diff --git a/f_check b/f_check
index d20b96081..e9aca4ff9 100644
--- a/f_check
+++ b/f_check
@@ -32,7 +32,7 @@ if ($compiler eq "") {
 	      "xlf95", "xlf90", "xlf",
 	      "ppuf77", "ppuf95", "ppuf90", "ppuxlf",
 	      "pathf90", "pathf95",
-	      "pgf95", "pgf90", "pgf77",
+	      "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
 	      "flang", "egfortran",
 	      "ifort");
 
@@ -64,7 +64,6 @@ if ($compiler eq "") {
     if (!$?) {
 
 	$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
-
 	if ($data =~ /zhoge_/) {
 	    $bu = "_";
 	}
@@ -87,7 +86,7 @@ if ($compiler eq "") {
 	    if ($compiler =~ /flang/) {
 		$vendor = FLANG;
 		$openmp = "-fopenmp";
-	    } elsif ($compiler =~ /pgf/) {
+	    } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
 		$vendor = PGI;
 		$openmp = "-mp";
 	    } else {
@@ -123,7 +122,7 @@ if ($compiler eq "") {
 	    $openmp = "-mp";
 	}
 
-	if ($data =~ /PGF/) {
+	if ($data =~ /PGF/ || $data =~ /NVF/) {
 	    $vendor = PGI;
 	    $openmp = "-mp";
 	}
@@ -177,7 +176,7 @@ if ($compiler eq "") {
 	    $openmp = "-mp";
 	}
 
-	if ($compiler =~ /pgf/) {
+	if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
 	    $vendor = PGI;
 	    $bu = "_";
 	    $openmp = "-mp";
@@ -330,7 +329,7 @@ if ($link ne "") {
 	    $flags =~ s/\@/\,/g;
 	    $linker_L .= "-Wl,". $flags . " " ;
 	}
-	if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
+	if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
 	    $flags = "-lomp";
 	}
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e86546b9..1a6c9413f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
   ifeq ($(GCCVERSIONGTEQ10), 1)
     override CFLAGS += -march=cooperlake
   else
-    override CFLAGS += -march=skylake-avx512
+    override CFLAGS += -march=skylake-avx512 -mavx512f
   endif
   ifeq ($(OSNAME), CYGWIN_NT)
     override CFLAGS += -fno-asynchronous-unwind-tables
@@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
     endif
   endif
 else ifeq ($(TARGET_CORE), SKYLAKEX)
-  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
   ifeq ($(OSNAME), CYGWIN_NT)
     override CFLAGS += -fno-asynchronous-unwind-tables
   endif
diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1
index ea010db42..074d72153 100644
--- a/kernel/arm64/KERNEL.NEOVERSEN1
+++ b/kernel/arm64/KERNEL.NEOVERSEN1
@@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
 ICAMAXKERNEL = izamax_thunderx2t99.c
 IZAMAXKERNEL = izamax_thunderx2t99.c
 
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
+SNRM2KERNEL = nrm2.S
+DNRM2KERNEL = nrm2.S
+CNRM2KERNEL = znrm2.S
+ZNRM2KERNEL = znrm2.S
 
 DDOTKERNEL = dot_thunderx2t99.c
 SDOTKERNEL = dot_thunderx2t99.c
diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99
index a20d0d4a6..8333f60e6 100644
--- a/kernel/arm64/KERNEL.THUNDERX2T99
+++ b/kernel/arm64/KERNEL.THUNDERX2T99
@@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
 ICAMAXKERNEL = izamax_thunderx2t99.c
 IZAMAXKERNEL = izamax_thunderx2t99.c
 
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
+SNRM2KERNEL = nrm2.S
+CNRM2KERNEL = znrm2.S
 #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
 #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
+DNRM2KERNEL = nrm2.S
+ZNRM2KERNEL = znrm2.S
 
 
 DDOTKERNEL = dot_thunderx2t99.c
diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110
index a20d0d4a6..4cdd8769f 100644
--- a/kernel/arm64/KERNEL.THUNDERX3T110
+++ b/kernel/arm64/KERNEL.THUNDERX3T110
@@ -153,13 +153,16 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
 ICAMAXKERNEL = izamax_thunderx2t99.c
 IZAMAXKERNEL = izamax_thunderx2t99.c
 
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
-
+#SNRM2KERNEL = scnrm2_thunderx2t99.c
+#CNRM2KERNEL = scnrm2_thunderx2t99.c
+##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
+##ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
+#DNRM2KERNEL = dznrm2_thunderx2t99.c
+#ZNRM2KERNEL = dznrm2_thunderx2t99.c
+SNRM2KERNEL = nrm2.S
+DNRM2KERNEL = nrm2.S
+CNRM2KERNEL = znrm2.S
+ZNRM2KERNEL = znrm2.S
 
 DDOTKERNEL = dot_thunderx2t99.c
 SDOTKERNEL = dot_thunderx2t99.c