commit
525db5401c
|
@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), THUNDERX3T110)
|
||||||
|
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||||
|
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||||
|
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
|
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||||
ifeq ($(CORE), TSV110)
|
ifeq ($(CORE), TSV110)
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||||
|
|
|
@ -11,34 +11,34 @@ endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER10)
|
ifeq ($(CORE), POWER10)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
else
|
else
|
||||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math
|
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER9)
|
ifeq ($(CORE), POWER9)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifneq ($(C_COMPILER), PGI)
|
ifneq ($(C_COMPILER), PGI)
|
||||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
||||||
endif
|
endif
|
||||||
ifneq ($(F_COMPILER), PGI)
|
ifneq ($(F_COMPILER), PGI)
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
ifneq ($(C_COMPILER), PGI)
|
ifneq ($(C_COMPILER), PGI)
|
||||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||||
endif
|
endif
|
||||||
ifneq ($(F_COMPILER), PGI)
|
ifneq ($(F_COMPILER), PGI)
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -O2 -Mrecursive
|
FCOMMON_OPT += -O2 -Mrecursive
|
||||||
endif
|
endif
|
||||||
|
@ -48,26 +48,26 @@ endif
|
||||||
ifeq ($(CORE), POWER8)
|
ifeq ($(CORE), POWER8)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifneq ($(C_COMPILER), PGI)
|
ifneq ($(C_COMPILER), PGI)
|
||||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
||||||
endif
|
endif
|
||||||
ifneq ($(F_COMPILER), PGI)
|
ifneq ($(F_COMPILER), PGI)
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
ifneq ($(C_COMPILER), PGI)
|
ifneq ($(C_COMPILER), PGI)
|
||||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
|
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||||
endif
|
endif
|
||||||
ifneq ($(F_COMPILER), PGI)
|
ifneq ($(F_COMPILER), PGI)
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -O2 -Mrecursive
|
FCOMMON_OPT += -O2 -Mrecursive
|
||||||
|
|
|
@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX
|
||||||
DYNAMIC_CORE += THUNDERX2T99
|
DYNAMIC_CORE += THUNDERX2T99
|
||||||
DYNAMIC_CORE += TSV110
|
DYNAMIC_CORE += TSV110
|
||||||
DYNAMIC_CORE += EMAG8180
|
DYNAMIC_CORE += EMAG8180
|
||||||
|
DYNAMIC_CORE += THUNDERX3T110
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), zarch)
|
ifeq ($(ARCH), zarch)
|
||||||
|
@ -617,7 +618,6 @@ DYNAMIC_CORE += POWER8
|
||||||
ifneq ($(C_COMPILER), GCC)
|
ifneq ($(C_COMPILER), GCC)
|
||||||
DYNAMIC_CORE += POWER9
|
DYNAMIC_CORE += POWER9
|
||||||
DYNAMIC_CORE += POWER10
|
DYNAMIC_CORE += POWER10
|
||||||
override LDFLAGS += -Wl,-no-power10-stubs
|
|
||||||
endif
|
endif
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
ifeq ($(GCCVERSIONGT5), 1)
|
ifeq ($(GCCVERSIONGT5), 1)
|
||||||
|
@ -627,11 +627,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||||
endif
|
endif
|
||||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||||
DYNAMIC_CORE += POWER10
|
DYNAMIC_CORE += POWER10
|
||||||
override LDFLAGS += -Wl,-no-power10-stubs
|
|
||||||
else ifeq ($(GCCVERSIONGTEQ10), 1)
|
else ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||||
DYNAMIC_CORE += POWER10
|
DYNAMIC_CORE += POWER10
|
||||||
override LDFLAGS += -Wl,-no-power10-stubs
|
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||||
|
@ -1241,7 +1239,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||||
|
|
||||||
include $(TOPDIR)/Makefile.$(ARCH)
|
include $(TOPDIR)/Makefile.$(ARCH)
|
||||||
|
|
||||||
|
ifneq ($(C_COMPILER), PGI)
|
||||||
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
||||||
|
endif
|
||||||
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
||||||
|
|
||||||
ifeq ($(CORE), PPC440)
|
ifeq ($(CORE), PPC440)
|
||||||
|
|
|
@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||||
## Installation from Source
|
## Installation from Source
|
||||||
|
|
||||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
|
||||||
|
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
|
||||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||||
Most can also be given directly on the make or cmake command line.
|
Most can also be given directly on the make or cmake command line.
|
||||||
|
|
||||||
|
|
|
@ -96,6 +96,7 @@ FALKOR
|
||||||
THUNDERX
|
THUNDERX
|
||||||
THUNDERX2T99
|
THUNDERX2T99
|
||||||
TSV110
|
TSV110
|
||||||
|
THUNDERX3T110
|
||||||
|
|
||||||
9.System Z:
|
9.System Z:
|
||||||
ZARCH_GENERIC
|
ZARCH_GENERIC
|
||||||
|
|
|
@ -45,7 +45,7 @@ endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
if (ARM64)
|
if (ARM64)
|
||||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
|
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (POWER)
|
if (POWER)
|
||||||
|
|
|
@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||||
"#define HAVE_VFP\n"
|
"#define HAVE_VFP\n"
|
||||||
"#define HAVE_NEON\n"
|
"#define HAVE_NEON\n"
|
||||||
"#define ARMV8\n")
|
"#define ARMV8\n")
|
||||||
|
if ("${TCORE}" STREQUAL "CORTEXA57")
|
||||||
set(SGEMM_UNROLL_M 16)
|
set(SGEMM_UNROLL_M 16)
|
||||||
set(SGEMM_UNROLL_N 4)
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
else ()
|
||||||
|
set(SGEMM_UNROLL_M 8)
|
||||||
|
set(SGEMM_UNROLL_N 8)
|
||||||
|
endif ()
|
||||||
set(DGEMM_UNROLL_M 8)
|
set(DGEMM_UNROLL_M 8)
|
||||||
set(DGEMM_UNROLL_N 4)
|
set(DGEMM_UNROLL_N 4)
|
||||||
set(CGEMM_UNROLL_M 8)
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||||
set(ZGEMM_UNROLL_M 4)
|
set(ZGEMM_UNROLL_M 4)
|
||||||
set(ZGEMM_UNROLL_N 4)
|
set(ZGEMM_UNROLL_N 4)
|
||||||
set(SYMV_P 16)
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "THUNDERX3T110")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define THUNDERX3T110\n"
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L2_SIZE\t524288\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L3_SIZE\t94371840\n"
|
||||||
|
"#define L3_LINESIZE\t64\n"
|
||||||
|
"#define L3_ASSOCIATIVE\t32\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define ARMV8\n"
|
"#define ARMV8\n"
|
||||||
|
|
|
@ -40,6 +40,7 @@
|
||||||
// Cavium
|
// Cavium
|
||||||
#define CPU_THUNDERX 7
|
#define CPU_THUNDERX 7
|
||||||
#define CPU_THUNDERX2T99 8
|
#define CPU_THUNDERX2T99 8
|
||||||
|
#define CPU_THUNDERX3T110 12
|
||||||
//Hisilicon
|
//Hisilicon
|
||||||
#define CPU_TSV110 9
|
#define CPU_TSV110 9
|
||||||
// Ampere
|
// Ampere
|
||||||
|
@ -57,7 +58,8 @@ static char *cpuname[] = {
|
||||||
"THUNDERX2T99",
|
"THUNDERX2T99",
|
||||||
"TSV110",
|
"TSV110",
|
||||||
"EMAG8180",
|
"EMAG8180",
|
||||||
"NEOVERSEN1"
|
"NEOVERSEN1",
|
||||||
|
"THUNDERX3T110"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *cpuname_lower[] = {
|
static char *cpuname_lower[] = {
|
||||||
|
@ -72,7 +74,8 @@ static char *cpuname_lower[] = {
|
||||||
"thunderx2t99",
|
"thunderx2t99",
|
||||||
"tsv110",
|
"tsv110",
|
||||||
"emag8180",
|
"emag8180",
|
||||||
"neoversen1"
|
"neoversen1",
|
||||||
|
"thunderx3t110"
|
||||||
};
|
};
|
||||||
|
|
||||||
int get_feature(char *search)
|
int get_feature(char *search)
|
||||||
|
@ -158,6 +161,8 @@ int detect(void)
|
||||||
return CPU_THUNDERX;
|
return CPU_THUNDERX;
|
||||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||||
return CPU_THUNDERX2T99;
|
return CPU_THUNDERX2T99;
|
||||||
|
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8"))
|
||||||
|
return CPU_THUNDERX3T110;
|
||||||
// HiSilicon
|
// HiSilicon
|
||||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||||
return CPU_TSV110;
|
return CPU_TSV110;
|
||||||
|
@ -372,7 +377,25 @@ void get_cpuconfig(void)
|
||||||
printf("#define L2_LINESIZE 64\n");
|
printf("#define L2_LINESIZE 64\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CPU_THUNDERX3T110:
|
||||||
|
printf("#define THUNDERX3T110 \n");
|
||||||
|
printf("#define L1_CODE_SIZE 65536 \n");
|
||||||
|
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||||
|
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768 \n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||||
|
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||||
|
printf("#define L2_SIZE 524288 \n");
|
||||||
|
printf("#define L2_LINESIZE 64 \n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||||
|
printf("#define L3_SIZE 94371840 \n");
|
||||||
|
printf("#define L3_LINESIZE 64 \n");
|
||||||
|
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||||
|
printf("#define DTB_SIZE 4096 \n");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
get_cpucount();
|
get_cpucount();
|
||||||
}
|
}
|
||||||
|
|
23
cpuid_x86.c
23
cpuid_x86.c
|
@ -1454,10 +1454,11 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_OPTERON;
|
return CPUTYPE_OPTERON;
|
||||||
case 1:
|
case 1:
|
||||||
case 3:
|
case 3:
|
||||||
case 7:
|
// case 7:
|
||||||
case 10:
|
// case 10:
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
case 5:
|
case 5:
|
||||||
|
case 7:
|
||||||
return CPUTYPE_BOBCAT;
|
return CPUTYPE_BOBCAT;
|
||||||
case 6:
|
case 6:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
|
@ -1507,6 +1508,8 @@ int get_cpuname(void){
|
||||||
// AMD Ryzen
|
// AMD Ryzen
|
||||||
case 8:
|
case 8:
|
||||||
// AMD Ryzen2
|
// AMD Ryzen2
|
||||||
|
default:
|
||||||
|
// Matisse/Renoir and other recent Ryzen2
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
return CPUTYPE_ZEN;
|
return CPUTYPE_ZEN;
|
||||||
|
@ -1516,6 +1519,16 @@ int get_cpuname(void){
|
||||||
else
|
else
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
case 10: // Zen3
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CPUTYPE_ZEN;
|
||||||
|
#else
|
||||||
|
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2107,7 +2120,7 @@ int get_coretype(void){
|
||||||
return CORE_PILEDRIVER;
|
return CORE_PILEDRIVER;
|
||||||
else
|
else
|
||||||
return CORE_BARCELONA; //OS don't support AVX.
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
case 5: // New EXCAVATOR
|
case 5: // New EXCAVATOR
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return CORE_EXCAVATOR;
|
return CORE_EXCAVATOR;
|
||||||
else
|
else
|
||||||
|
@ -2135,12 +2148,14 @@ int get_coretype(void){
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (exfamily == 8) {
|
} else if (exfamily == 8 || exfamily == 10) {
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 1:
|
case 1:
|
||||||
// AMD Ryzen
|
// AMD Ryzen
|
||||||
case 8:
|
case 8:
|
||||||
// Ryzen 2
|
// Ryzen 2
|
||||||
|
default:
|
||||||
|
// Matisse,Renoir Ryzen2 models
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
return CORE_ZEN;
|
return CORE_ZEN;
|
||||||
|
|
|
@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) {
|
if ((exfamily == 0) || (exfamily == 2)) {
|
||||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||||
else return &gotoblas_OPTERON;
|
else return &gotoblas_OPTERON;
|
||||||
} else if (exfamily == 5) {
|
} else if (exfamily == 5 || exfamily == 7) {
|
||||||
return &gotoblas_BOBCAT;
|
return &gotoblas_BOBCAT;
|
||||||
} else if (exfamily == 6) {
|
} else if (exfamily == 6) {
|
||||||
if(model == 1){
|
if(model == 1){
|
||||||
|
@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (exfamily == 8) {
|
} else if (exfamily == 8) {
|
||||||
if (model == 1 || model == 8) {
|
/* if (model == 1 || model == 8) */ {
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_ZEN;
|
return &gotoblas_ZEN;
|
||||||
else{
|
else{
|
||||||
|
@ -724,10 +724,18 @@ static gotoblas_t *get_coretype(void){
|
||||||
else{
|
else{
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
} else if (exfamily == 10) {
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_ZEN;
|
||||||
|
else{
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||||
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
}else {
|
}else {
|
||||||
return &gotoblas_BARCELONA;
|
return &gotoblas_BARCELONA;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||||
extern gotoblas_t gotoblas_TSV110;
|
extern gotoblas_t gotoblas_TSV110;
|
||||||
extern gotoblas_t gotoblas_EMAG8180;
|
extern gotoblas_t gotoblas_EMAG8180;
|
||||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||||
|
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
|
|
||||||
#define NUM_CORETYPES 11
|
#define NUM_CORETYPES 12
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||||
|
@ -82,6 +83,7 @@ static char *corename[] = {
|
||||||
"tsv110",
|
"tsv110",
|
||||||
"emag8180",
|
"emag8180",
|
||||||
"neoversen1",
|
"neoversen1",
|
||||||
|
"thunderx3t110",
|
||||||
"unknown"
|
"unknown"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -97,6 +99,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||||
|
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||||
return corename[NUM_CORETYPES];
|
return corename[NUM_CORETYPES];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||||
case 8: return (&gotoblas_TSV110);
|
case 8: return (&gotoblas_TSV110);
|
||||||
case 9: return (&gotoblas_EMAG8180);
|
case 9: return (&gotoblas_EMAG8180);
|
||||||
case 10: return (&gotoblas_NEOVERSEN1);
|
case 10: return (&gotoblas_NEOVERSEN1);
|
||||||
|
case 11: return (&gotoblas_THUNDERX3T110);
|
||||||
}
|
}
|
||||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||||
openblas_warning(1, message);
|
openblas_warning(1, message);
|
||||||
|
@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
|
||||||
return &gotoblas_THUNDERX;
|
return &gotoblas_THUNDERX;
|
||||||
case 0x0af: // ThunderX2
|
case 0x0af: // ThunderX2
|
||||||
return &gotoblas_THUNDERX2T99;
|
return &gotoblas_THUNDERX2T99;
|
||||||
|
case 0x0b8: // ThunderX3
|
||||||
|
return &gotoblas_THUNDERX3T110;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 0x48: // HiSilicon
|
case 0x48: // HiSilicon
|
||||||
|
|
18
getarch.c
18
getarch.c
|
@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "EMAG8180"
|
#define CORENAME "EMAG8180"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_THUNDERX3T110
|
||||||
|
#define ARMV8
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "ARM64"
|
||||||
|
#define SUBARCHITECTURE "THUNDERX3T110"
|
||||||
|
#define SUBDIRNAME "arm64"
|
||||||
|
#define ARCHCONFIG "-DTHUNDERX3T110 " \
|
||||||
|
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||||
|
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||||
|
"-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||||
|
#define LIBNAME "thunderx3t110"
|
||||||
|
#define CORENAME "THUNDERX3T110"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_ZARCH_GENERIC
|
#ifdef FORCE_ZARCH_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "ZARCH"
|
#define ARCHITECTURE "ZARCH"
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||||
#else
|
#else
|
||||||
// Disable multi-threading as it does not show any performance
|
// Disable multi-threading as it does not show any performance
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||||
#else
|
#else
|
||||||
// Disable multi-threading as it does not show any performance
|
// Disable multi-threading as it does not show any performance
|
||||||
|
|
|
@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC)
|
||||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), power)
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
override CFLAGS += -fno-integrated-as
|
||||||
|
endif
|
||||||
|
endif
|
||||||
AVX2OPT =
|
AVX2OPT =
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
# AVX2 support was added in 4.7.0
|
# AVX2 support was added in 4.7.0
|
||||||
|
|
|
@ -44,8 +44,10 @@ USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER8)
|
ifeq ($(CORE), POWER8)
|
||||||
|
ifeq ($(BINARY64),1)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER9)
|
ifeq ($(CORE), POWER9)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
|
|
|
@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
|
|
||||||
dot[0]=0.0;
|
dot[0]=0.0;
|
||||||
dot[1]=0.0;
|
dot[1]=0.0;
|
||||||
|
#if !defined(__PPC__)
|
||||||
CREAL(result) = 0.0 ;
|
CREAL(result) = 0.0 ;
|
||||||
CIMAG(result) = 0.0 ;
|
CIMAG(result) = 0.0 ;
|
||||||
|
#else
|
||||||
|
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||||
|
#endif
|
||||||
if ( n < 1 ) return(result);
|
if ( n < 1 ) return(result);
|
||||||
|
|
||||||
inc_x2 = 2 * inc_x ;
|
inc_x2 = 2 * inc_x ;
|
||||||
|
@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
i++ ;
|
i++ ;
|
||||||
|
|
||||||
}
|
}
|
||||||
CREAL(result) = dot[0];
|
#if !defined(__POWER__)
|
||||||
|
CREAL(result) = dot[0];
|
||||||
CIMAG(result) = dot[1];
|
CIMAG(result) = dot[1];
|
||||||
|
#else
|
||||||
|
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]);
|
||||||
|
#endif
|
||||||
return(result);
|
return(result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,184 @@
|
||||||
|
SAMINKERNEL = ../arm/amin.c
|
||||||
|
DAMINKERNEL = ../arm/amin.c
|
||||||
|
CAMINKERNEL = ../arm/zamin.c
|
||||||
|
ZAMINKERNEL = ../arm/zamin.c
|
||||||
|
|
||||||
|
SMAXKERNEL = ../arm/max.c
|
||||||
|
DMAXKERNEL = ../arm/max.c
|
||||||
|
|
||||||
|
SMINKERNEL = ../arm/min.c
|
||||||
|
DMINKERNEL = ../arm/min.c
|
||||||
|
|
||||||
|
ISAMINKERNEL = ../arm/iamin.c
|
||||||
|
IDAMINKERNEL = ../arm/iamin.c
|
||||||
|
ICAMINKERNEL = ../arm/izamin.c
|
||||||
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
|
|
||||||
|
ISMAXKERNEL = ../arm/imax.c
|
||||||
|
IDMAXKERNEL = ../arm/imax.c
|
||||||
|
|
||||||
|
ISMINKERNEL = ../arm/imin.c
|
||||||
|
IDMINKERNEL = ../arm/imin.c
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
SAMAXKERNEL = amax.S
|
||||||
|
DAMAXKERNEL = amax.S
|
||||||
|
CAMAXKERNEL = zamax.S
|
||||||
|
ZAMAXKERNEL = zamax.S
|
||||||
|
|
||||||
|
SAXPYKERNEL = axpy.S
|
||||||
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
|
CAXPYKERNEL = zaxpy.S
|
||||||
|
ZAXPYKERNEL = zaxpy.S
|
||||||
|
|
||||||
|
SROTKERNEL = rot.S
|
||||||
|
DROTKERNEL = rot.S
|
||||||
|
CROTKERNEL = zrot.S
|
||||||
|
ZROTKERNEL = zrot.S
|
||||||
|
|
||||||
|
SSCALKERNEL = scal.S
|
||||||
|
DSCALKERNEL = scal.S
|
||||||
|
CSCALKERNEL = zscal.S
|
||||||
|
ZSCALKERNEL = zscal.S
|
||||||
|
|
||||||
|
SGEMVNKERNEL = gemv_n.S
|
||||||
|
DGEMVNKERNEL = gemv_n.S
|
||||||
|
CGEMVNKERNEL = zgemv_n.S
|
||||||
|
ZGEMVNKERNEL = zgemv_n.S
|
||||||
|
|
||||||
|
SGEMVTKERNEL = gemv_t.S
|
||||||
|
DGEMVTKERNEL = gemv_t.S
|
||||||
|
CGEMVTKERNEL = zgemv_t.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t.S
|
||||||
|
|
||||||
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
|
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||||
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
SASUMKERNEL = sasum_thunderx2t99.c
|
||||||
|
DASUMKERNEL = dasum_thunderx2t99.c
|
||||||
|
CASUMKERNEL = casum_thunderx2t99.c
|
||||||
|
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||||
|
|
||||||
|
SCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
DCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
CCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
|
||||||
|
SSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
DSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
CSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
|
||||||
|
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
|
||||||
|
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||||
|
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||||
|
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
|
||||||
|
|
||||||
|
DDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
SDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
CDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||||
|
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||||
|
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||||
|
endif
|
|
@ -1,3 +1,44 @@
|
||||||
|
# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
|
||||||
|
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||||
|
SGEMMKERNEL = gemm_kernel_power6.S
|
||||||
|
SGEMMINCOPY =
|
||||||
|
SGEMMITCOPY =
|
||||||
|
SGEMMONCOPY = gemm_ncopy_4.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_4.S
|
||||||
|
SGEMMINCOPYOBJ =
|
||||||
|
SGEMMITCOPYOBJ =
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = gemm_kernel_power6.S
|
||||||
|
DGEMMINCOPY =
|
||||||
|
DGEMMITCOPY =
|
||||||
|
DGEMMONCOPY = gemm_ncopy_4.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_4.S
|
||||||
|
DGEMMINCOPYOBJ =
|
||||||
|
DGEMMITCOPYOBJ =
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = zgemm_kernel_power6.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_power6.S
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
else
|
||||||
|
|
||||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
|
@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||||
|
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
|
||||||
|
else
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
endif
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c
|
||||||
#
|
#
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
|
#
|
||||||
|
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||||
|
CAXPYKERNEL = zaxpy.S
|
||||||
|
else
|
||||||
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
CAXPYKERNEL = caxpy_power8.S
|
CAXPYKERNEL = caxpy_power8.S
|
||||||
|
@ -162,6 +215,7 @@ endif
|
||||||
else
|
else
|
||||||
CAXPYKERNEL = caxpy.c
|
CAXPYKERNEL = caxpy.c
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
#
|
#
|
||||||
ZAXPYKERNEL = zaxpy.c
|
ZAXPYKERNEL = zaxpy.c
|
||||||
#
|
#
|
||||||
|
@ -239,4 +293,3 @@ IDAMINKERNEL = ../arm/iamin.c
|
||||||
IZAMAXKERNEL = ../arm/izamax.c
|
IZAMAXKERNEL = ../arm/izamax.c
|
||||||
IZAMINKERNEL = ../arm/izamin.c
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "casum_microk_power8.c"
|
#include "casum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "ccopy_microk_power8.c"
|
#include "ccopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/zdot.c"
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
return (result);
|
return (result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/zgemv_n.c"
|
||||||
|
#else
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/zgemv_t.c"
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
{
|
{
|
||||||
|
@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||||
|
@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ( (inc_x == 1) && (inc_y == 1) )
|
||||||
{
|
{
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
BLASLONG n1 = n & -8;
|
BLASLONG n1 = n & -8;
|
||||||
if ( n1 > 0 )
|
if ( n1 > 0 )
|
||||||
{
|
{
|
||||||
|
@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
i=n1;
|
i=n1;
|
||||||
ix=2*n1;
|
ix=2*n1;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
while(i < n)
|
while(i < n)
|
||||||
{
|
{
|
||||||
temp[0] = c*x[ix] + s*y[ix] ;
|
temp[0] = c*x[ix] + s*y[ix] ;
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "cswap_microk_power8.c"
|
#include "cswap_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "dasum_microk_power8.c"
|
#include "dasum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "daxpy_microk_power8.c"
|
#include "daxpy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "dcopy_microk_power8.c"
|
#include "dcopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "ddot_microk_power8.c"
|
#include "ddot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
|
|
@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
typedef __vector unsigned char vec_t;
|
||||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||||
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
#define SAVE_ACC(ACC, J) \
|
#define SAVE_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] = result[3] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha; \
|
rowC[0] = result[1] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||||
rowC[0] = result[1] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
|
||||||
rowC[0] = result[0] * alpha;
|
|
||||||
#define SAVE_ACC1(ACC, J) \
|
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
|
||||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
|
||||||
rowC[0] = result[3] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
|
||||||
rowC[0] = result[2] * alpha; \
|
rowC[0] = result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||||
|
rowC[0] = result[3] * alpha;
|
||||||
|
#define SAVE_ACC1(ACC, J) \
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
|
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||||
|
rowC[0] = result[0] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||||
rowC[0] = result[1] * alpha; \
|
rowC[0] = result[1] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||||
|
rowC[0] = result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||||
rowC[0] = result[0] * alpha;
|
rowC[0] = result[3] * alpha;
|
||||||
#define SAVE2x4_ACC(ACC, J) \
|
#define SAVE2x4_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] = result[3] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha;
|
rowC[0] = result[1] * alpha;
|
||||||
#else
|
#else
|
||||||
#define SAVE_ACC(ACC, J) \
|
#define SAVE_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||||
rowC[0] += result[1] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
|
||||||
rowC[0] += result[0] * alpha;
|
|
||||||
#define SAVE_ACC1(ACC, J) \
|
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
|
||||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
|
||||||
rowC[0] += result[3] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||||
|
rowC[0] += result[3] * alpha;
|
||||||
|
#define SAVE_ACC1(ACC, J) \
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
|
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||||
|
rowC[0] += result[0] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||||
rowC[0] += result[1] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||||
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[3] * alpha;
|
||||||
#define SAVE2x4_ACC(ACC, J) \
|
#define SAVE2x4_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha;
|
rowC[0] += result[1] * alpha;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define SET_ACC_ZERO4() \
|
#define SET_ACC_ZERO4() \
|
||||||
|
|
|
@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "dgemv_n_microk_power8.c"
|
#include "dgemv_n_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
|
|
@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/gemv_t.c"
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#define NBMAX 1024
|
#define NBMAX 1024
|
||||||
//#define PREFETCH 1
|
//#define PREFETCH 1
|
||||||
|
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
#define HAVE_KERNEL4x8_ASM 1
|
#define HAVE_KERNEL4x8_ASM 1
|
||||||
|
|
||||||
|
|
||||||
#if defined(HAVE_KERNEL4x8_ASM)
|
#if defined(HAVE_KERNEL4x8_ASM)
|
||||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
||||||
|
|
||||||
|
@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"stxvd2x 39, %[off], %[y] \n\t"
|
"stxvd2x 39, %[off], %[y] \n\t"
|
||||||
"stxvd2x 40, %[off2], %[y] \n\t"
|
"stxvd2x 40, %[off2], %[y] \n\t"
|
||||||
|
|
||||||
: [memy] "+m" (*(const double (*)[8])y),
|
: [memy] "+m" (*(double (*)[8])y),
|
||||||
[n] "+&r" (n),
|
[n] "+&r" (n),
|
||||||
[a0] "=b" (a0),
|
[a0] "=b" (a0),
|
||||||
[a1] "=&b" (a1),
|
[a1] "=&b" (a1),
|
||||||
|
@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
[off2]"=&b" (off2),
|
[off2]"=&b" (off2),
|
||||||
[temp] "=&b" (tempR)
|
[temp] "=&b" (tempR)
|
||||||
: [memx] "m" (*(const double (*)[n])x),
|
: [memx] "m" (*(const double (*)[n])x),
|
||||||
[mem_ap] "m" (*(const double (*)[]) ap),
|
[mem_ap] "m" (*(const double (*)[n*8]) ap),
|
||||||
[alpha] "d" (alpha),
|
[alpha] "d" (alpha),
|
||||||
"[a0]" (ap),
|
"[a0]" (ap),
|
||||||
[x] "b" (x),
|
[x] "b" (x),
|
||||||
|
@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#pragma GCC optimize "O1"
|
#pragma GCC optimize "O1"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "drot_microk_power8.c"
|
#include "drot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "dscal_microk_power8.c"
|
#include "dscal_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !defined(HAVE_KERNEL_8)
|
#if !defined(HAVE_KERNEL_8)
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "dswap_microk_power8.c"
|
#include "dswap_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
|
@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find maximum index
|
* Find maximum index
|
||||||
* Warning: requirements n>0 and n % 32 == 0
|
* Warning: requirements n>0 and n % 32 == 0
|
||||||
|
@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
return index;
|
return index;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
|
@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
max = diamax_kernel_32(n1, x, &maxf);
|
max = diamax_kernel_32(n1, x, &maxf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
|
|
|
@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find minimum index
|
* Find minimum index
|
||||||
* Warning: requirements n>0 and n % 32 == 0
|
* Warning: requirements n>0 and n % 32 == 0
|
||||||
|
@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
return index;
|
return index;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
min = diamin_kernel_32(n1, x, &minf);
|
min = diamin_kernel_32(n1, x, &minf);
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
|
|
|
@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find maximum index
|
* Find maximum index
|
||||||
|
@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
|
@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = n1 << 1;
|
ix = n1 << 1;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
while(i < n)
|
while(i < n)
|
||||||
|
|
|
@ -25,13 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find minimum index
|
* Find minimum index
|
||||||
|
@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
minf = CABS1(x,0); //index will not be incremented
|
minf = CABS1(x,0); //index will not be incremented
|
||||||
|
|
||||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
|
@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = n1 << 1;
|
ix = n1 << 1;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
while(i < n)
|
while(i < n)
|
||||||
|
@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "sasum_microk_power8.c"
|
#include "sasum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
|
@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#define offset_0 0
|
||||||
|
#define offset_1 16
|
||||||
|
#define offset_2 32
|
||||||
|
#define offset_3 48
|
||||||
|
#define offset_4 64
|
||||||
|
#define offset_5 80
|
||||||
|
#define offset_6 96
|
||||||
|
#define offset_7 112
|
||||||
|
#define offset_8 128
|
||||||
|
#define offset_9 144
|
||||||
|
#define offset_10 160
|
||||||
|
#define offset_11 176
|
||||||
|
#define offset_12 192
|
||||||
|
#define offset_13 208
|
||||||
|
#define offset_14 224
|
||||||
|
#define offset_15 240
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||||
{
|
{
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
__vector float v_a = {alpha,alpha,alpha,alpha};
|
__vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha};
|
||||||
__vector float * v_y=(__vector float *)y;
|
__vector float * vptr_y =(__vector float *)y;
|
||||||
__vector float * v_x=(__vector float *)x;
|
__vector float * vptr_x =(__vector float *)x;
|
||||||
|
|
||||||
for(; i<n/4; i+=16){
|
for(; i<n/4; i+=16){
|
||||||
|
|
||||||
|
|
||||||
|
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||||
|
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||||
|
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||||
|
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||||
|
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
|
||||||
|
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
|
||||||
|
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
|
||||||
|
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
|
||||||
|
register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ;
|
||||||
|
register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ;
|
||||||
|
register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ;
|
||||||
|
register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ;
|
||||||
|
register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ;
|
||||||
|
register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ;
|
||||||
|
register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ;
|
||||||
|
register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ;
|
||||||
|
|
||||||
|
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||||
|
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||||
|
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||||
|
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||||
|
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
|
||||||
|
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
|
||||||
|
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
|
||||||
|
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
|
||||||
|
register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ;
|
||||||
|
register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ;
|
||||||
|
register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ;
|
||||||
|
register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ;
|
||||||
|
register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ;
|
||||||
|
register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ;
|
||||||
|
register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ;
|
||||||
|
register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ;
|
||||||
|
vy_0 += vx_0*v_a;
|
||||||
|
vy_1 += vx_1*v_a;
|
||||||
|
vy_2 += vx_2*v_a;
|
||||||
|
vy_3 += vx_3*v_a;
|
||||||
|
vy_4 += vx_4*v_a;
|
||||||
|
vy_5 += vx_5*v_a;
|
||||||
|
vy_6 += vx_6*v_a;
|
||||||
|
vy_7 += vx_7*v_a;
|
||||||
|
vy_8 += vx_8*v_a;
|
||||||
|
vy_9 += vx_9*v_a;
|
||||||
|
vy_10 += vx_10*v_a;
|
||||||
|
vy_11 += vx_11*v_a;
|
||||||
|
vy_12 += vx_12*v_a;
|
||||||
|
vy_13 += vx_13*v_a;
|
||||||
|
vy_14 += vx_14*v_a;
|
||||||
|
vy_15 += vx_15*v_a;
|
||||||
|
|
||||||
|
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_8, offset_8 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_9, offset_9 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_10, offset_10 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_11, offset_11 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_12, offset_12 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_13, offset_13 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_14, offset_14 ,vptr_y ) ;
|
||||||
|
vec_vsx_st( vy_15, offset_15 ,vptr_y ) ;
|
||||||
|
|
||||||
|
vptr_x+=16;
|
||||||
|
vptr_y+=16;
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
v_y[i] += v_a * v_x[i];
|
v_y[i] += v_a * v_x[i];
|
||||||
v_y[i+1] += v_a * v_x[i+1];
|
v_y[i+1] += v_a * v_x[i+1];
|
||||||
v_y[i+2] += v_a * v_x[i+2];
|
v_y[i+2] += v_a * v_x[i+2];
|
||||||
|
@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||||
v_y[i+13] += v_a * v_x[i+13];
|
v_y[i+13] += v_a * v_x[i+13];
|
||||||
v_y[i+14] += v_a * v_x[i+14];
|
v_y[i+14] += v_a * v_x[i+14];
|
||||||
v_y[i+15] += v_a * v_x[i+15];
|
v_y[i+15] += v_a * v_x[i+15];
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||||
{
|
{
|
||||||
|
@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
{
|
{
|
||||||
|
|
||||||
BLASLONG n1 = n & -64;
|
BLASLONG n1 = n & -64;
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
if ( n1 )
|
if ( n1 )
|
||||||
saxpy_kernel_64(n1, x, y, da);
|
saxpy_kernel_64(n1, x, y, da);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
|
#endif
|
||||||
while(i < n)
|
while(i < n)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "scopy_microk_power8.c"
|
#include "scopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
#include "sdot_microk_power8.c"
|
#include "sdot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
typedef __vector unsigned char vec_t;
|
||||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#define SAVE_ACC(ACC, J) \
|
#define SAVE_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] = result[3] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha; \
|
rowC[0] = result[1] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||||
rowC[0] = result[1] * alpha; \
|
rowC[0] = result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||||
rowC[0] = result[0] * alpha;
|
rowC[0] = result[3] * alpha;
|
||||||
#define SAVE_ACC1(ACC, J) \
|
#define SAVE_ACC1(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||||
rowC[0] = result[3] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
|
||||||
rowC[0] = result[1] * alpha; \
|
rowC[0] = result[1] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||||
|
rowC[0] = result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||||
rowC[0] = result[0] * alpha;
|
rowC[0] = result[3] * alpha;
|
||||||
#define SAVE4x2_ACC(ACC, J) \
|
#define SAVE4x2_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] = result[6] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] = result[4] * alpha; \
|
rowC[0] = result[2] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
|
||||||
rowC[0] = result[0] * alpha;
|
|
||||||
#define SAVE4x2_ACC1(ACC, J) \
|
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
|
||||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
|
||||||
rowC[0] = result[6] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
|
||||||
rowC[0] = result[4] * alpha; \
|
rowC[0] = result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||||
|
rowC[0] = result[6] * alpha;
|
||||||
|
#define SAVE4x2_ACC1(ACC, J) \
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
|
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||||
|
rowC[0] = result[0] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha; \
|
rowC[0] = result[2] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||||
|
rowC[0] = result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||||
rowC[0] = result[0] * alpha;
|
rowC[0] = result[6] * alpha;
|
||||||
#define SAVE2x4_ACC(ACC, J) \
|
#define SAVE2x4_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] = result[3] * alpha; \
|
rowC[0] = result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] = result[2] * alpha;
|
rowC[0] = result[1] * alpha;
|
||||||
#else
|
#else
|
||||||
#define SAVE_ACC(ACC, J) \
|
#define SAVE_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||||
rowC[0] += result[1] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[3] * alpha;
|
||||||
#define SAVE_ACC1(ACC, J) \
|
#define SAVE_ACC1(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
|
||||||
rowC[0] += result[1] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||||
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[3] * alpha;
|
||||||
#define SAVE4x2_ACC(ACC, J) \
|
#define SAVE4x2_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[6] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] += result[4] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
|
||||||
rowC[0] += result[0] * alpha;
|
|
||||||
#define SAVE4x2_ACC1(ACC, J) \
|
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
|
||||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
|
||||||
rowC[0] += result[6] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
|
||||||
rowC[0] += result[4] * alpha; \
|
rowC[0] += result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||||
|
rowC[0] += result[6] * alpha;
|
||||||
|
#define SAVE4x2_ACC1(ACC, J) \
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
|
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||||
|
rowC[0] += result[0] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||||
|
rowC[0] += result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[6] * alpha;
|
||||||
#define SAVE2x4_ACC(ACC, J) \
|
#define SAVE2x4_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha;
|
rowC[0] += result[1] * alpha;
|
||||||
#endif
|
#endif
|
||||||
#define KERNEL(i, j) \
|
#define KERNEL(i, j) \
|
||||||
__builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
|
__builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
|
||||||
|
|
|
@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/gemv_n.c"
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return(0);
|
return(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/gemv_t.c"
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16)
|
||||||
#define BF16TOF32(x) x
|
#define BF16TOF32(x) x
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
typedef __vector unsigned char vec_t;
|
||||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||||
|
|
||||||
|
@ -64,54 +64,54 @@ vector char mask =
|
||||||
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y)
|
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y)
|
||||||
|
|
||||||
#define SAVE_ACC(ACC, J) \
|
#define SAVE_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||||
rowC[0] += result[1] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[3] * alpha;
|
||||||
#define SAVE_ACC1(ACC, J) \
|
#define SAVE_ACC1(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
|
||||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
|
||||||
rowC[0] += result[1] * alpha; \
|
rowC[0] += result[1] * alpha; \
|
||||||
|
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||||
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[3] * alpha;
|
||||||
#define SAVE4x2_ACC(ACC, J) \
|
#define SAVE4x2_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[6] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] += result[4] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
|
||||||
rowC[0] += result[0] * alpha;
|
|
||||||
#define SAVE4x2_ACC1(ACC, J) \
|
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
|
||||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
|
||||||
rowC[0] += result[6] * alpha; \
|
|
||||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
|
||||||
rowC[0] += result[4] * alpha; \
|
rowC[0] += result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||||
|
rowC[0] += result[6] * alpha;
|
||||||
|
#define SAVE4x2_ACC1(ACC, J) \
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
|
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||||
|
rowC[0] += result[0] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha; \
|
rowC[0] += result[2] * alpha; \
|
||||||
|
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||||
|
rowC[0] += result[4] * alpha; \
|
||||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||||
rowC[0] += result[0] * alpha;
|
rowC[0] += result[6] * alpha;
|
||||||
|
|
||||||
#define MMA __builtin_mma_xvbf16ger2pp
|
#define MMA __builtin_mma_xvbf16ger2pp
|
||||||
|
|
||||||
#define SAVE2x4_ACC(ACC, J) \
|
#define SAVE2x4_ACC(ACC, J) \
|
||||||
__builtin_mma_disassemble_acc (result, ACC); \
|
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||||
rowC[0] += result[3] * alpha; \
|
rowC[0] += result[0] * alpha; \
|
||||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||||
rowC[0] += result[2] * alpha;
|
rowC[0] += result[1] * alpha;
|
||||||
|
|
||||||
#define SET_ACC_ZERO4() \
|
#define SET_ACC_ZERO4() \
|
||||||
__builtin_mma_xxsetaccz (&acc0); \
|
__builtin_mma_xxsetaccz (&acc0); \
|
||||||
|
|
|
@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#pragma GCC optimize "O1"
|
#pragma GCC optimize "O1"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "srot_microk_power8.c"
|
#include "srot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "sscal_microk_power8.c"
|
#include "sscal_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if !defined(HAVE_KERNEL_16)
|
#if !defined(HAVE_KERNEL_16)
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "sswap_microk_power8.c"
|
#include "sswap_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_32
|
#ifndef HAVE_KERNEL_32
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "zasum_microk_power8.c"
|
#include "zasum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "zaxpy_microk_power8.c"
|
#include "zaxpy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_4
|
#ifndef HAVE_KERNEL_4
|
||||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "zcopy_microk_power8.c"
|
#include "zcopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "zdot_microk_power8.c"
|
#include "zdot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
||||||
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
|
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
|
||||||
|
|
||||||
if ( n <= 0 )
|
if ( n <= 0 )
|
||||||
{
|
{ /*
|
||||||
__real__ result = 0.0 ;
|
__real__ result = 0.0 ;
|
||||||
__imag__ result = 0.0 ;
|
__imag__ result = 0.0 ;
|
||||||
|
*/
|
||||||
|
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||||
return(result);
|
return(result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined(CONJ)
|
#if !defined(CONJ)
|
||||||
|
/*
|
||||||
__real__ result = dot[0] - dot[1];
|
__real__ result = dot[0] - dot[1];
|
||||||
__imag__ result = dot[2] + dot[3];
|
__imag__ result = dot[2] + dot[3];
|
||||||
|
*/
|
||||||
|
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
|
||||||
#else
|
#else
|
||||||
|
/*
|
||||||
__real__ result = dot[0] + dot[1];
|
__real__ result = dot[0] + dot[1];
|
||||||
__imag__ result = dot[2] - dot[3];
|
__imag__ result = dot[2] - dot[3];
|
||||||
|
*/
|
||||||
|
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
#define HAVE_KERNEL_4x4_VEC 1
|
#define HAVE_KERNEL_4x4_VEC 1
|
||||||
#define HAVE_KERNEL_4x2_VEC 1
|
#define HAVE_KERNEL_4x2_VEC 1
|
||||||
#define HAVE_KERNEL_4x1_VEC 1
|
#define HAVE_KERNEL_4x1_VEC 1
|
||||||
|
@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
|
|
@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
|
||||||
#define HAVE_KERNEL_4x4_VEC 1
|
#define HAVE_KERNEL_4x4_VEC 1
|
||||||
#define HAVE_KERNEL_4x2_VEC 1
|
#define HAVE_KERNEL_4x2_VEC 1
|
||||||
#define HAVE_KERNEL_4x1_VEC 1
|
#define HAVE_KERNEL_4x1_VEC 1
|
||||||
|
|
||||||
|
#endif
|
||||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||||
|
#include "../arm/zrot.c"
|
||||||
|
#else
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#pragma GCC optimize "O1"
|
#pragma GCC optimize "O1"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#if defined(DOUBLE)
|
#if defined(DOUBLE)
|
||||||
#include "zscal_microk_power8.c"
|
#include "zscal_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_8
|
#ifndef HAVE_KERNEL_8
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
#include "zswap_microk_power8.c"
|
#include "zswap_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_16
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
|
@ -1,667 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
|
||||||
|
|
||||||
//The array of job_t may overflow the stack.
|
|
||||||
//Instead, use malloc to alloc job_t.
|
|
||||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
|
||||||
#define USE_ALLOC_HEAP
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
static FLOAT dm1 = -1.;
|
|
||||||
|
|
||||||
#ifndef KERNEL_FUNC
|
|
||||||
#ifndef LOWER
|
|
||||||
#define KERNEL_FUNC SYRK_KERNEL_U
|
|
||||||
#else
|
|
||||||
#define KERNEL_FUNC SYRK_KERNEL_L
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
#ifndef COMPLEX
|
|
||||||
#define TRSM_KERNEL TRSM_KERNEL_LT
|
|
||||||
#else
|
|
||||||
#define TRSM_KERNEL TRSM_KERNEL_LC
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#ifndef COMPLEX
|
|
||||||
#define TRSM_KERNEL TRSM_KERNEL_RN
|
|
||||||
#else
|
|
||||||
#define TRSM_KERNEL TRSM_KERNEL_RR
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef CACHE_LINE_SIZE
|
|
||||||
#define CACHE_LINE_SIZE 8
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef DIVIDE_RATE
|
|
||||||
#define DIVIDE_RATE 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef SWITCH_RATIO
|
|
||||||
#define SWITCH_RATIO 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
#define TRANS
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef SYRK_LOCAL
|
|
||||||
#if !defined(LOWER) && !defined(TRANS)
|
|
||||||
#define SYRK_LOCAL SYRK_UN
|
|
||||||
#elif !defined(LOWER) && defined(TRANS)
|
|
||||||
#define SYRK_LOCAL SYRK_UT
|
|
||||||
#elif defined(LOWER) && !defined(TRANS)
|
|
||||||
#define SYRK_LOCAL SYRK_LN
|
|
||||||
#else
|
|
||||||
#define SYRK_LOCAL SYRK_LT
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
#ifdef HAVE_C11
|
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
|
||||||
#endif
|
|
||||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
|
||||||
} job_t;
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef KERNEL_OPERATION
|
|
||||||
#ifndef COMPLEX
|
|
||||||
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
|
|
||||||
KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
|
|
||||||
#else
|
|
||||||
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
|
|
||||||
KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef ICOPY_OPERATION
|
|
||||||
#ifndef TRANS
|
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
|
||||||
#else
|
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef OCOPY_OPERATION
|
|
||||||
#ifdef TRANS
|
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
|
||||||
#else
|
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef S
|
|
||||||
#define S args -> a
|
|
||||||
#endif
|
|
||||||
#ifndef A
|
|
||||||
#define A args -> b
|
|
||||||
#endif
|
|
||||||
#ifndef C
|
|
||||||
#define C args -> c
|
|
||||||
#endif
|
|
||||||
#ifndef LDA
|
|
||||||
#define LDA args -> lda
|
|
||||||
#endif
|
|
||||||
#ifndef N
|
|
||||||
#define N args -> m
|
|
||||||
#endif
|
|
||||||
#ifndef K
|
|
||||||
#define K args -> k
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
|
||||||
|
|
||||||
FLOAT *buffer[DIVIDE_RATE];
|
|
||||||
|
|
||||||
BLASLONG k, lda;
|
|
||||||
BLASLONG m_from, m_to;
|
|
||||||
|
|
||||||
FLOAT *alpha;
|
|
||||||
FLOAT *a, *c;
|
|
||||||
job_t *job = (job_t *)args -> common;
|
|
||||||
BLASLONG xxx, bufferside;
|
|
||||||
|
|
||||||
BLASLONG jjs, min_jj;
|
|
||||||
BLASLONG is, min_i, div_n;
|
|
||||||
|
|
||||||
BLASLONG i, current;
|
|
||||||
|
|
||||||
k = K;
|
|
||||||
|
|
||||||
a = (FLOAT *)A;
|
|
||||||
c = (FLOAT *)C;
|
|
||||||
|
|
||||||
lda = LDA;
|
|
||||||
|
|
||||||
alpha = (FLOAT *)args -> alpha;
|
|
||||||
|
|
||||||
m_from = range_n[mypos + 0];
|
|
||||||
m_to = range_n[mypos + 1];
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
|
||||||
|
|
||||||
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
|
||||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
|
||||||
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
|
|
||||||
#else
|
|
||||||
TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
|
|
||||||
|
|
||||||
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
|
|
||||||
|
|
||||||
min_jj = MIN(m_to, xxx + div_n) - jjs;
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
|
|
||||||
#else
|
|
||||||
if (min_jj > GEMM_P) min_jj = GEMM_P;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
|
|
||||||
|
|
||||||
TRSM_KERNEL (k, min_jj, k, dm1,
|
|
||||||
#ifdef COMPLEX
|
|
||||||
ZERO,
|
|
||||||
#endif
|
|
||||||
sb,
|
|
||||||
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
|
|
||||||
a + jjs * lda * COMPSIZE, lda, 0);
|
|
||||||
#else
|
|
||||||
ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
|
|
||||||
|
|
||||||
TRSM_KERNEL (min_jj, k, k, dm1,
|
|
||||||
#ifdef COMPLEX
|
|
||||||
ZERO,
|
|
||||||
#endif
|
|
||||||
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
|
|
||||||
sb,
|
|
||||||
a + jjs * COMPSIZE, lda, 0);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
for (i = 0; i <= mypos; i++)
|
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
|
||||||
#else
|
|
||||||
for (i = mypos; i < args -> nthreads; i++)
|
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
WMB;
|
|
||||||
}
|
|
||||||
|
|
||||||
min_i = m_to - m_from;
|
|
||||||
|
|
||||||
if (min_i >= GEMM_P * 2) {
|
|
||||||
min_i = GEMM_P;
|
|
||||||
} else
|
|
||||||
if (min_i > GEMM_P) {
|
|
||||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
|
|
||||||
#else
|
|
||||||
OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
current = mypos;
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
while (current < args -> nthreads)
|
|
||||||
#else
|
|
||||||
while (current >= 0)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
|
||||||
|
|
||||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
|
||||||
|
|
||||||
/* thread has to wait */
|
|
||||||
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
|
||||||
|
|
||||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
|
|
||||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
|
||||||
c, lda, m_from, xxx);
|
|
||||||
|
|
||||||
if (m_from + min_i >= m_to) {
|
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
|
||||||
WMB;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
current ++;
|
|
||||||
#else
|
|
||||||
current --;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
for(is = m_from + min_i; is < m_to; is += min_i){
|
|
||||||
min_i = m_to - is;
|
|
||||||
|
|
||||||
if (min_i >= GEMM_P * 2) {
|
|
||||||
min_i = GEMM_P;
|
|
||||||
} else
|
|
||||||
if (min_i > GEMM_P) {
|
|
||||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
|
|
||||||
#else
|
|
||||||
OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
current = mypos;
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
while (current < args -> nthreads)
|
|
||||||
#else
|
|
||||||
while (current >= 0)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
|
||||||
|
|
||||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
|
||||||
|
|
||||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
|
|
||||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
|
||||||
c, lda, is, xxx);
|
|
||||||
|
|
||||||
if (is + min_i >= m_to) {
|
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
|
||||||
WMB;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifndef LOWER
|
|
||||||
current ++;
|
|
||||||
#else
|
|
||||||
current --;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
|
||||||
if (i != mypos) {
|
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|
||||||
|
|
||||||
blas_arg_t newarg;
|
|
||||||
|
|
||||||
#ifndef USE_ALLOC_HEAP
|
|
||||||
job_t job[MAX_CPU_NUMBER];
|
|
||||||
#else
|
|
||||||
job_t * job = NULL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
|
||||||
|
|
||||||
BLASLONG range[MAX_CPU_NUMBER + 100];
|
|
||||||
|
|
||||||
BLASLONG num_cpu;
|
|
||||||
|
|
||||||
BLASLONG nthreads = args -> nthreads;
|
|
||||||
|
|
||||||
BLASLONG width, i, j, k;
|
|
||||||
BLASLONG n, n_from, n_to;
|
|
||||||
int mode, mask;
|
|
||||||
double dnum;
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
|
||||||
#ifdef XDOUBLE
|
|
||||||
mode = BLAS_XDOUBLE | BLAS_REAL;
|
|
||||||
mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
|
|
||||||
#elif defined(DOUBLE)
|
|
||||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
|
||||||
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
|
||||||
#elif defined(HALF)
|
|
||||||
mode = BLAS_HALF | BLAS_REAL;
|
|
||||||
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
|
|
||||||
#else
|
|
||||||
mode = BLAS_SINGLE | BLAS_REAL;
|
|
||||||
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#ifdef XDOUBLE
|
|
||||||
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
|
||||||
mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
|
|
||||||
#elif defined(DOUBLE)
|
|
||||||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
|
||||||
mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
|
|
||||||
#else
|
|
||||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
|
||||||
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
newarg.m = args -> m;
|
|
||||||
newarg.k = args -> k;
|
|
||||||
newarg.a = args -> a;
|
|
||||||
newarg.b = args -> b;
|
|
||||||
newarg.c = args -> c;
|
|
||||||
newarg.lda = args -> lda;
|
|
||||||
newarg.alpha = args -> alpha;
|
|
||||||
|
|
||||||
#ifdef USE_ALLOC_HEAP
|
|
||||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
|
||||||
if(job==NULL){
|
|
||||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
newarg.common = (void *)job;
|
|
||||||
|
|
||||||
n_from = 0;
|
|
||||||
n_to = args -> m;
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
|
|
||||||
range[MAX_CPU_NUMBER] = n_to - n_from;
|
|
||||||
range[0] = 0;
|
|
||||||
num_cpu = 0;
|
|
||||||
i = 0;
|
|
||||||
n = n_to - n_from;
|
|
||||||
|
|
||||||
dnum = (double)n * (double)n /(double)nthreads;
|
|
||||||
|
|
||||||
while (i < n){
|
|
||||||
|
|
||||||
if (nthreads - num_cpu > 1) {
|
|
||||||
|
|
||||||
double di = (double)i;
|
|
||||||
|
|
||||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
|
||||||
|
|
||||||
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
|
|
||||||
|
|
||||||
if ((width > n - i) || (width < mask)) width = n - i;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
width = n - i;
|
|
||||||
}
|
|
||||||
|
|
||||||
range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;
|
|
||||||
|
|
||||||
queue[num_cpu].mode = mode;
|
|
||||||
queue[num_cpu].routine = inner_thread;
|
|
||||||
queue[num_cpu].args = &newarg;
|
|
||||||
queue[num_cpu].range_m = NULL;
|
|
||||||
|
|
||||||
queue[num_cpu].sa = NULL;
|
|
||||||
queue[num_cpu].sb = NULL;
|
|
||||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
|
||||||
|
|
||||||
num_cpu ++;
|
|
||||||
i += width;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
range[0] = 0;
|
|
||||||
num_cpu = 0;
|
|
||||||
i = 0;
|
|
||||||
n = n_to - n_from;
|
|
||||||
|
|
||||||
dnum = (double)n * (double)n /(double)nthreads;
|
|
||||||
|
|
||||||
while (i < n){
|
|
||||||
|
|
||||||
if (nthreads - num_cpu > 1) {
|
|
||||||
|
|
||||||
double di = (double)i;
|
|
||||||
|
|
||||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
|
||||||
|
|
||||||
if ((width > n - i) || (width < mask)) width = n - i;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
width = n - i;
|
|
||||||
}
|
|
||||||
|
|
||||||
range[num_cpu + 1] = range[num_cpu] + width;
|
|
||||||
|
|
||||||
queue[num_cpu].mode = mode;
|
|
||||||
queue[num_cpu].routine = inner_thread;
|
|
||||||
queue[num_cpu].args = &newarg;
|
|
||||||
queue[num_cpu].range_m = NULL;
|
|
||||||
queue[num_cpu].range_n = range;
|
|
||||||
queue[num_cpu].sa = NULL;
|
|
||||||
queue[num_cpu].sb = NULL;
|
|
||||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
|
||||||
|
|
||||||
num_cpu ++;
|
|
||||||
i += width;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
newarg.nthreads = num_cpu;
|
|
||||||
|
|
||||||
if (num_cpu) {
|
|
||||||
|
|
||||||
for (j = 0; j < num_cpu; j++) {
|
|
||||||
for (i = 0; i < num_cpu; i++) {
|
|
||||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
|
||||||
job[j].working[i][CACHE_LINE_SIZE * k] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
queue[0].sa = sa;
|
|
||||||
queue[0].sb = sb;
|
|
||||||
queue[num_cpu - 1].next = NULL;
|
|
||||||
|
|
||||||
exec_blas(num_cpu, queue);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef USE_ALLOC_HEAP
|
|
||||||
free(job);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
|
|
||||||
|
|
||||||
BLASLONG n, bk, i, blocking, lda;
|
|
||||||
BLASLONG info;
|
|
||||||
int mode;
|
|
||||||
blas_arg_t newarg;
|
|
||||||
FLOAT *a;
|
|
||||||
FLOAT alpha[2] = { -ONE, ZERO};
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
|
||||||
#ifdef XDOUBLE
|
|
||||||
mode = BLAS_XDOUBLE | BLAS_REAL;
|
|
||||||
#elif defined(DOUBLE)
|
|
||||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
|
||||||
#else
|
|
||||||
mode = BLAS_SINGLE | BLAS_REAL;
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#ifdef XDOUBLE
|
|
||||||
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
|
||||||
#elif defined(DOUBLE)
|
|
||||||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
|
||||||
#else
|
|
||||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (args -> nthreads == 1) {
|
|
||||||
#ifndef LOWER
|
|
||||||
info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
|
|
||||||
#else
|
|
||||||
info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
|
|
||||||
#endif
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
n = args -> n;
|
|
||||||
a = (FLOAT *)args -> a;
|
|
||||||
lda = args -> lda;
|
|
||||||
|
|
||||||
if (range_n) n = range_n[1] - range_n[0];
|
|
||||||
|
|
||||||
if (n <= GEMM_UNROLL_N * 2) {
|
|
||||||
#ifndef LOWER
|
|
||||||
info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
|
|
||||||
#else
|
|
||||||
info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
|
|
||||||
#endif
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
newarg.lda = lda;
|
|
||||||
newarg.ldb = lda;
|
|
||||||
newarg.ldc = lda;
|
|
||||||
newarg.alpha = alpha;
|
|
||||||
newarg.beta = NULL;
|
|
||||||
newarg.nthreads = args -> nthreads;
|
|
||||||
|
|
||||||
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
|
|
||||||
if (blocking > GEMM_Q) blocking = GEMM_Q;
|
|
||||||
|
|
||||||
for (i = 0; i < n; i += blocking) {
|
|
||||||
bk = n - i;
|
|
||||||
if (bk > blocking) bk = blocking;
|
|
||||||
|
|
||||||
newarg.m = bk;
|
|
||||||
newarg.n = bk;
|
|
||||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
|
|
||||||
if (info) return info + i;
|
|
||||||
|
|
||||||
if (n - i - bk > 0) {
|
|
||||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
|
||||||
newarg.m = n - i - bk;
|
|
||||||
newarg.k = bk;
|
|
||||||
#ifndef LOWER
|
|
||||||
newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE;
|
|
||||||
#else
|
|
||||||
newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
|
|
||||||
#endif
|
|
||||||
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
thread_driver(&newarg, sa, sb);
|
|
||||||
#else
|
|
||||||
|
|
||||||
#ifndef LOWER
|
|
||||||
newarg.m = bk;
|
|
||||||
newarg.n = n - i - bk;
|
|
||||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
|
||||||
newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
gemm_thread_n(mode | BLAS_TRANSA_T,
|
|
||||||
&newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
|
|
||||||
|
|
||||||
newarg.n = n - i - bk;
|
|
||||||
newarg.k = bk;
|
|
||||||
newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE;
|
|
||||||
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
|
|
||||||
#else
|
|
||||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
|
|
||||||
&newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
newarg.m = n - i - bk;
|
|
||||||
newarg.n = bk;
|
|
||||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
|
||||||
newarg.b = a + (i + bk + i * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
|
|
||||||
&newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
|
|
||||||
|
|
||||||
newarg.n = n - i - bk;
|
|
||||||
newarg.k = bk;
|
|
||||||
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
|
|
||||||
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
|
|
||||||
#else
|
|
||||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
|
|
||||||
&newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -101,7 +101,12 @@ static FLOAT dm1 = -1.;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#ifdef HAVE_C11
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||||
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
||||||
|
#elif defined(HALF)
|
||||||
|
mode = BLAS_HALF | BLAS_REAL;
|
||||||
|
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
|
||||||
#else
|
#else
|
||||||
mode = BLAS_SINGLE | BLAS_REAL;
|
mode = BLAS_SINGLE | BLAS_REAL;
|
||||||
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
||||||
|
|
43
param.h
43
param.h
|
@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define GEMM_DEFAULT_OFFSET_A 0
|
#define GEMM_DEFAULT_OFFSET_A 0
|
||||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||||
|
#if defined(__32BIT__)
|
||||||
|
#warning using BINARY32==POWER6
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#else
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#endif
|
||||||
#define SGEMM_DEFAULT_P 1280UL
|
#define SGEMM_DEFAULT_P 1280UL
|
||||||
#define DGEMM_DEFAULT_P 640UL
|
#define DGEMM_DEFAULT_P 640UL
|
||||||
#define CGEMM_DEFAULT_P 640UL
|
#define CGEMM_DEFAULT_P 640UL
|
||||||
|
@ -2769,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
#define CGEMM_DEFAULT_R 4096
|
#define CGEMM_DEFAULT_R 4096
|
||||||
#define ZGEMM_DEFAULT_R 4096
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
||||||
|
#elif defined(THUNDERX3T110)
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_P 128
|
||||||
|
#define DGEMM_DEFAULT_P 320
|
||||||
|
#define CGEMM_DEFAULT_P 128
|
||||||
|
#define ZGEMM_DEFAULT_P 128
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_Q 352
|
||||||
|
#define DGEMM_DEFAULT_Q 128
|
||||||
|
#define CGEMM_DEFAULT_Q 224
|
||||||
|
#define ZGEMM_DEFAULT_Q 112
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_R 4096
|
||||||
|
#define DGEMM_DEFAULT_R 4096
|
||||||
|
#define CGEMM_DEFAULT_R 4096
|
||||||
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
||||||
#elif defined(NEOVERSEN1)
|
#elif defined(NEOVERSEN1)
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
|
Loading…
Reference in New Issue