Merge pull request #6 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-12-27 21:28:10 +01:00 committed by GitHub
commit 9b3965b08c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 209 additions and 27 deletions

View File

@ -59,6 +59,9 @@ endif
@$(CC) --version > /dev/null 2>&1;\ @$(CC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \ if [ $$? -eq 0 ]; then \
cverinfo=`$(CC) --version | sed -n '1p'`; \ cverinfo=`$(CC) --version | sed -n '1p'`; \
if [ -z "$${cverinfo}" ]; then \
cverinfo=`$(CC) --version | sed -n '2p'`; \
fi; \
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
else \ else \
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(FC) --version > /dev/null 2>&1;\ @$(FC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \ if [ $$? -eq 0 ]; then \
fverinfo=`$(FC) --version | sed -n '1p'`; \ fverinfo=`$(FC) --version | sed -n '1p'`; \
if [ -z "$${fverinfo}" ]; then \
fverinfo=`$(FC) --version | sed -n '2p'`; \
fi; \
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
else \ else \
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\

View File

@ -10,9 +10,11 @@ USE_OPENMP = 1
endif endif
ifeq ($(CORE), POWER10) ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif endif
endif
ifeq ($(CORE), POWER9) ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI) ifneq ($(C_COMPILER), PGI)

View File

@ -181,7 +181,7 @@ endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64) ifeq ($(HOSTARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),) ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native GETARCH_FLAGS += -march=native
endif endif
endif endif
@ -663,6 +663,7 @@ endif
endif # ARCH zarch endif # ARCH zarch
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
ifneq ($(C_COMPILER), PGI)
DYNAMIC_CORE = POWER6 DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8 DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC) ifneq ($(C_COMPILER), GCC)
@ -689,6 +690,10 @@ else
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
endif endif
endif endif
else
DYNAMIC_CORE = POWER8
DYNAMIC_CORE += POWER9
endif
endif endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@ -847,9 +852,19 @@ endif
endif endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
# Classify the PGI / NVIDIA HPC compiler release from line 2 of `$(CC) --version`.
# Each `expr` comparison prints 1 (true) or 0 (false); the three results are
# concatenated below into a three-digit key.
#   digit 1: major version >  20
#   digit 2: major version >= 20
#   digit 3: characters 4-5 of the version string equal 11
#            (NOTE: despite the GE11 name this is an equality test)
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
# The middle digit must be PGCVERSIONGTEQ20. Referencing a variable that is never
# defined would expand to nothing, yield a two-digit key, and never match the
# three-digit patterns below, so NEWPGI could never be set.
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
# 110 / 111: major version above 20.  011: major version 20 with minor 11.
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
NEWPGI := 1
endif
ifdef BINARY64 ifdef BINARY64
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm CCOMMON_OPT += -tp p7-64
ifneq ($(NEWPGI),1)
CCOMMON_OPT += -D__MMX__ -Mnollvm
endif
else else
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
ifeq ($(CORE), POWER8) ifeq ($(CORE), POWER8)
@ -1029,18 +1044,24 @@ ifeq ($(ARCH), x86_64)
FCOMMON_OPT += -tp p7-64 FCOMMON_OPT += -tp p7-64
else else
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
ifeq ($(CORE), POWER6)
$(warning NVIDIA HPC compilers do not support POWER6.)
endif
ifeq ($(CORE), POWER8) ifeq ($(CORE), POWER8)
FCOMMON_OPT += -tp pwr8 FCOMMON_OPT += -tp pwr8
endif endif
ifeq ($(CORE), POWER9) ifeq ($(CORE), POWER9)
FCOMMON_OPT += -tp pwr9 FCOMMON_OPT += -tp pwr9
endif endif
ifeq ($(CORE), POWER10)
$(warning NVIDIA HPC compilers do not support POWER10.)
endif
endif endif
endif endif
else else
FCOMMON_OPT += -tp p7 FCOMMON_OPT += -tp p7
endif endif
FCOMMON_OPT += -Mrecursive FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp FCOMMON_OPT += -mp
endif endif

View File

@ -13,7 +13,7 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta
## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.

View File

@ -27,7 +27,9 @@ static char *corename[] = {
#define NUM_CORETYPES 4 #define NUM_CORETYPES 4
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
#ifndef C_PGI
if (gotoblas == &gotoblas_POWER6) return corename[1]; if (gotoblas == &gotoblas_POWER6) return corename[1];
#endif
if (gotoblas == &gotoblas_POWER8) return corename[2]; if (gotoblas == &gotoblas_POWER8) return corename[2];
#if (!defined __GNUC__) || ( __GNUC__ >= 6) #if (!defined __GNUC__) || ( __GNUC__ >= 6)
if (gotoblas == &gotoblas_POWER9) return corename[3]; if (gotoblas == &gotoblas_POWER9) return corename[3];
@ -38,10 +40,157 @@ char *gotoblas_corename(void) {
return corename[0]; return corename[0];
} }
#ifdef C_PGI
/*
* NV HPC compilers do not yet implement __builtin_cpu_is().
* Fake a version here for use in the CPU detection code below.
*
* Strategy here is to first check the CPU to see what it actually is,
* and then test the input to see if what the CPU actually is matches
* what was requested.
*/
#include <string.h>
/*
* Define POWER processor version table.
*
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
*/
/* CPU family codes stored in pvrPOWER[].cpu_type. */
#define CPU_UNKNOWN 0
#define CPU_POWER5 5
#define CPU_POWER6 6
#define CPU_POWER8 8
#define CPU_POWER9 9
#define CPU_POWER10 10

/*
 * Processor Version Register (PVR) lookup table.
 *
 * An entry matches a PVR reading when (pvr & pvr_mask) == pvr_value.
 * Entries are meant to be scanned in order, first match wins, so the more
 * specific masks must come before the broader ones for the same family.
 *
 * The final entry has pvr_mask == 0 and pvr_value == 0 and therefore matches
 * every PVR; it guarantees that an in-order scan always terminates inside
 * the table.
 *
 * The table is read-only, hence const.
 */
static const struct {
  uint32_t pvr_mask;
  uint32_t pvr_value;
  const char *cpu_name;
  uint32_t cpu_type;
} pvrPOWER[] = {
  { /* POWER6 in P5+ mode; 2.04-compliant processor */
    .pvr_mask = 0xffffffff,
    .pvr_value = 0x0f000001,
    .cpu_name = "POWER5+",
    .cpu_type = CPU_POWER5,
  },
  { /* Power6 aka POWER6X */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x003e0000,
    .cpu_name = "POWER6 (raw)",
    .cpu_type = CPU_POWER6,
  },
  { /* Power7 */
    /* NOTE(review): reported as CPU_POWER6; presumably intentional since
     * there is no separate POWER7 code -- confirm. */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x003f0000,
    .cpu_name = "POWER7 (raw)",
    .cpu_type = CPU_POWER6,
  },
  { /* Power7+ (also reported as CPU_POWER6) */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x004A0000,
    .cpu_name = "POWER7+ (raw)",
    .cpu_type = CPU_POWER6,
  },
  { /* Power8E */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x004b0000,
    .cpu_name = "POWER8E (raw)",
    .cpu_type = CPU_POWER8,
  },
  { /* Power8NVL */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x004c0000,
    .cpu_name = "POWER8NVL (raw)",
    .cpu_type = CPU_POWER8,
  },
  { /* Power8 */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x004d0000,
    .cpu_name = "POWER8 (raw)",
    .cpu_type = CPU_POWER8,
  },
  { /* Power9 DD2.0 */
    /* NOTE(review): mask 0xffffefff ignores bit 12 of the PVR; presumably a
     * don't-care revision bit -- confirm. */
    .pvr_mask = 0xffffefff,
    .pvr_value = 0x004e0200,
    .cpu_name = "POWER9 (raw)",
    .cpu_type = CPU_POWER9,
  },
  { /* Power9 DD 2.1 */
    .pvr_mask = 0xffffefff,
    .pvr_value = 0x004e0201,
    .cpu_name = "POWER9 (raw)",
    .cpu_type = CPU_POWER9,
  },
  { /* Power9 DD2.2 or later */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x004e0000,
    .cpu_name = "POWER9 (raw)",
    .cpu_type = CPU_POWER9,
  },
  { /* Power10 */
    .pvr_mask = 0xffff0000,
    .pvr_value = 0x00800000,
    .cpu_name = "POWER10 (raw)",
    .cpu_type = CPU_POWER10,
  },
  { /* End of table, pvr_mask and pvr_value must be zero */
    .pvr_mask = 0x0,
    .pvr_value = 0x0,
    .cpu_name = "Unknown",
    .cpu_type = CPU_UNKNOWN,
  },
};
/*
 * Replacement for the compiler builtin of the same name, used when building
 * with C_PGI defined.
 *
 * Reads the PVR with the mfpvr instruction, looks up the matching entry in
 * pvrPOWER[], and compares the detected CPU type with the requested name.
 *
 * cpu: CPU name to test for. Only "power8" and "power9" are recognised.
 *
 * Returns 1 if the running CPU is of the requested type, 0 otherwise
 * (including for every name other than "power8" / "power9").
 */
static int __builtin_cpu_is(const char *cpu) {
  size_t i;
  uint32_t pvr;
  uint32_t cpu_type;

  asm("mfpvr %0" : "=r"(pvr));

  /*
   * First match wins. The last table entry has a zero mask and zero value,
   * so it matches any PVR and the loop always breaks with i inside the table.
   */
  for (i = 0; i < sizeof pvrPOWER / sizeof *pvrPOWER; ++i) {
    if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
      break;
    }
  }

#if defined(DEBUG)
  /* cpu_type is an integer code, so print it with %u rather than %p. */
  printf("%s: returning CPU=%s, cpu_type=%u\n", __func__,
         pvrPOWER[i].cpu_name, (unsigned)pvrPOWER[i].cpu_type);
#endif

  cpu_type = pvrPOWER[i].cpu_type;

  if (!strcmp(cpu, "power8"))
    return cpu_type == CPU_POWER8;
  if (!strcmp(cpu, "power9"))
    return cpu_type == CPU_POWER9;
  return 0;
}
#endif /* C_PGI */
static gotoblas_t *get_coretype(void) { static gotoblas_t *get_coretype(void) {
#ifndef C_PGI
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
return &gotoblas_POWER6; return &gotoblas_POWER6;
#endif
if (__builtin_cpu_is("power8")) if (__builtin_cpu_is("power8"))
return &gotoblas_POWER8; return &gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6) #if (!defined __GNUC__) || ( __GNUC__ >= 6)
@ -77,7 +226,9 @@ static gotoblas_t *force_coretype(char * coretype) {
switch (found) switch (found)
{ {
#ifndef C_PGI
case 1: return (&gotoblas_POWER6); case 1: return (&gotoblas_POWER6);
#endif
case 2: return (&gotoblas_POWER8); case 2: return (&gotoblas_POWER8);
#if (!defined __GNUC__) || ( __GNUC__ >= 6) #if (!defined __GNUC__) || ( __GNUC__ >= 6)
case 3: return (&gotoblas_POWER9); case 3: return (&gotoblas_POWER9);

11
f_check
View File

@ -32,7 +32,7 @@ if ($compiler eq "") {
"xlf95", "xlf90", "xlf", "xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95", "pathf90", "pathf95",
"pgf95", "pgf90", "pgf77", "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
"flang", "egfortran", "flang", "egfortran",
"ifort"); "ifort");
@ -64,7 +64,6 @@ if ($compiler eq "") {
if (!$?) { if (!$?) {
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
if ($data =~ /zhoge_/) { if ($data =~ /zhoge_/) {
$bu = "_"; $bu = "_";
} }
@ -87,7 +86,7 @@ if ($compiler eq "") {
if ($compiler =~ /flang/) { if ($compiler =~ /flang/) {
$vendor = FLANG; $vendor = FLANG;
$openmp = "-fopenmp"; $openmp = "-fopenmp";
} elsif ($compiler =~ /pgf/) { } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI; $vendor = PGI;
$openmp = "-mp"; $openmp = "-mp";
} else { } else {
@ -123,7 +122,7 @@ if ($compiler eq "") {
$openmp = "-mp"; $openmp = "-mp";
} }
if ($data =~ /PGF/) { if ($data =~ /PGF/ || $data =~ /NVF/) {
$vendor = PGI; $vendor = PGI;
$openmp = "-mp"; $openmp = "-mp";
} }
@ -177,7 +176,7 @@ if ($compiler eq "") {
$openmp = "-mp"; $openmp = "-mp";
} }
if ($compiler =~ /pgf/) { if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI; $vendor = PGI;
$bu = "_"; $bu = "_";
$openmp = "-mp"; $openmp = "-mp";
@ -330,7 +329,7 @@ if ($link ne "") {
$flags =~ s/\@/\,/g; $flags =~ s/\@/\,/g;
$linker_L .= "-Wl,". $flags . " " ; $linker_L .= "-Wl,". $flags . " " ;
} }
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
$flags = "-lomp"; $flags = "-lomp";
} }

View File

@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=cooperlake override CFLAGS += -march=cooperlake
else else
override CFLAGS += -march=skylake-avx512 override CFLAGS += -march=skylake-avx512 -mavx512f
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables override CFLAGS += -fno-asynchronous-unwind-tables
@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
endif endif
endif endif
else ifeq ($(TARGET_CORE), SKYLAKEX) else ifeq ($(TARGET_CORE), SKYLAKEX)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables override CFLAGS += -fno-asynchronous-unwind-tables
endif endif

View File

@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c SNRM2KERNEL = nrm2.S
DNRM2KERNEL = dznrm2_thunderx2t99.c DNRM2KERNEL = nrm2.S
CNRM2KERNEL = scnrm2_thunderx2t99.c CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c

View File

@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c SNRM2KERNEL = nrm2.S
CNRM2KERNEL = scnrm2_thunderx2t99.c CNRM2KERNEL = nrm2.S
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c DNRM2KERNEL = znrm2.S
ZNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c

View File

@ -153,13 +153,16 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c #SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c #CNRM2KERNEL = scnrm2_thunderx2t99.c
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c ##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c ##ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c #DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c #ZNRM2KERNEL = dznrm2_thunderx2t99.c
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c