Merge pull request #1656 from xianyi/develop
Update the 0.3 branch from develop
This commit is contained in:
commit
3a8f0a6a1f
|
@ -6,12 +6,15 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||||
project(OpenBLAS C ASM)
|
project(OpenBLAS C ASM)
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 0.dev)
|
set(OpenBLAS_PATCH_VERSION 1.dev)
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
# Adhere to GNU filesystem layout conventions
|
# Adhere to GNU filesystem layout conventions
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
|
|
||||||
|
include(CMakePackageConfigHelpers)
|
||||||
|
|
||||||
|
|
||||||
set(OpenBLAS_LIBNAME openblas)
|
set(OpenBLAS_LIBNAME openblas)
|
||||||
|
|
||||||
#######
|
#######
|
||||||
|
@ -20,6 +23,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON
|
||||||
endif()
|
endif()
|
||||||
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
|
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
|
||||||
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
|
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
|
||||||
|
option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
|
||||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
|
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
|
||||||
#######
|
#######
|
||||||
if(BUILD_WITHOUT_LAPACK)
|
if(BUILD_WITHOUT_LAPACK)
|
||||||
|
@ -208,6 +212,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||||
|
|
||||||
# Install libraries
|
# Install libraries
|
||||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||||
|
EXPORT "OpenBLASTargets"
|
||||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||||
|
@ -267,3 +272,21 @@ if(PKG_CONFIG_FOUND)
|
||||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
||||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||||
|
set(PN OpenBLAS)
|
||||||
|
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}")
|
||||||
|
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake"
|
||||||
|
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||||
|
VERSION ${${PN}_VERSION}
|
||||||
|
COMPATIBILITY AnyNewerVersion)
|
||||||
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake
|
||||||
|
${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||||
|
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
install(EXPORT "${PN}Targets"
|
||||||
|
NAMESPACE "${PN}::"
|
||||||
|
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
|
||||||
|
|
12
Makefile
12
Makefile
|
@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
||||||
done
|
done
|
||||||
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifdef USE_THREAD
|
ifdef USE_THREAD
|
||||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||||
|
@ -294,9 +297,10 @@ endif
|
||||||
|
|
||||||
lapack-test :
|
lapack-test :
|
||||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
|
||||||
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||||
ifneq ($(CROSS), 1)
|
ifneq ($(CROSS), 1)
|
||||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||||
endif
|
endif
|
||||||
|
@ -308,9 +312,9 @@ lapack-runtest:
|
||||||
|
|
||||||
|
|
||||||
blas-test:
|
blas-test:
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||||
|
|
||||||
|
|
||||||
dummy :
|
dummy :
|
||||||
|
|
|
@ -98,7 +98,7 @@ endif
|
||||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.0.dev
|
VERSION = 0.3.1.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
|
||||||
# If you want to support multiple architecture in one binary
|
# If you want to support multiple architecture in one binary
|
||||||
# DYNAMIC_ARCH = 1
|
# DYNAMIC_ARCH = 1
|
||||||
|
|
||||||
|
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
|
||||||
|
# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON,
|
||||||
|
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
|
||||||
|
# DYNAMIC_OLDER = 1
|
||||||
|
|
||||||
# C compiler including binary type(32bit / 64bit). Default is gcc.
|
# C compiler including binary type(32bit / 64bit). Default is gcc.
|
||||||
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
|
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
|
||||||
# CC = gcc
|
# CC = gcc
|
||||||
|
@ -55,6 +60,14 @@ VERSION = 0.3.0.dev
|
||||||
# This flag is always set for POWER8. Don't modify the flag
|
# This flag is always set for POWER8. Don't modify the flag
|
||||||
# USE_OPENMP = 1
|
# USE_OPENMP = 1
|
||||||
|
|
||||||
|
# The OpenMP scheduler to use - by default this is "static" and you
|
||||||
|
# will normally not want to change this unless you know that your main
|
||||||
|
# workload will involve tasks that have highly unbalanced running times
|
||||||
|
# for individual threads. Changing away from "static" may also adversely
|
||||||
|
# affect memory access locality in NUMA systems. Setting to "runtime" will
|
||||||
|
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||||
|
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||||
|
|
||||||
# You can define maximum number of threads. Basically it should be
|
# You can define maximum number of threads. Basically it should be
|
||||||
# less than actual number of cores. If you don't specify one, it's
|
# less than actual number of cores. If you don't specify one, it's
|
||||||
# automatically detected by the the script.
|
# automatically detected by the the script.
|
||||||
|
@ -151,8 +164,11 @@ NO_AFFINITY = 1
|
||||||
# CONSISTENT_FPCSR = 1
|
# CONSISTENT_FPCSR = 1
|
||||||
|
|
||||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
||||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||||
# in small matrix sizes. The default value is 4.
|
# number of floating point operations necessary for the given problem size, no longer
|
||||||
|
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||||
|
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
|
||||||
|
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||||
|
|
||||||
# If you need santy check by comparing reference BLAS. It'll be very
|
# If you need santy check by comparing reference BLAS. It'll be very
|
||||||
|
|
|
@ -62,6 +62,9 @@ ifeq ($(BINARY), 32)
|
||||||
ifeq ($(TARGET), HASWELL)
|
ifeq ($(TARGET), HASWELL)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET), SKYLAKEX)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
ifeq ($(TARGET), SANDYBRIDGE)
|
ifeq ($(TARGET), SANDYBRIDGE)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
@ -95,6 +98,9 @@ ifeq ($(BINARY), 32)
|
||||||
ifeq ($(TARGET_CORE), HASWELL)
|
ifeq ($(TARGET_CORE), HASWELL)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
@ -141,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
|
||||||
GETARCH_FLAGS += -DNO_AVX2
|
GETARCH_FLAGS += -DNO_AVX2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(NO_AVX512), 1)
|
||||||
|
GETARCH_FLAGS += -DNO_AVX512
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(DEBUG), 1)
|
ifeq ($(DEBUG), 1)
|
||||||
GETARCH_FLAGS += -g
|
GETARCH_FLAGS += -g
|
||||||
endif
|
endif
|
||||||
|
@ -238,7 +248,7 @@ endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
ifndef MACOSX_DEPLOYMENT_TARGET
|
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||||
export MACOSX_DEPLOYMENT_TARGET=10.6
|
export MACOSX_DEPLOYMENT_TARGET=10.8
|
||||||
endif
|
endif
|
||||||
MD5SUM = md5 -r
|
MD5SUM = md5 -r
|
||||||
endif
|
endif
|
||||||
|
@ -462,13 +472,37 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += PENRYN DUNNINGTON
|
||||||
|
endif
|
||||||
|
DYNAMIC_CORE += NEHALEM
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += OPTERON OPTERON_SSE3
|
||||||
|
endif
|
||||||
|
DYNAMIC_CORE += BARCELONA
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += BOBCAT ATOM NANO
|
||||||
|
endif
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
||||||
endif
|
endif
|
||||||
ifneq ($(NO_AVX2), 1)
|
ifneq ($(NO_AVX2), 1)
|
||||||
DYNAMIC_CORE += HASWELL ZEN
|
DYNAMIC_CORE += HASWELL ZEN
|
||||||
endif
|
endif
|
||||||
|
ifneq ($(NO_AVX512), 1)
|
||||||
|
ifneq ($(NO_AVX2), 1)
|
||||||
|
DYNAMIC_CORE += SKYLAKEX
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef DYNAMIC_LIST
|
||||||
|
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
|
||||||
|
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
|
||||||
|
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||||
|
CCOMMON_OPT += $(XCCOMMON_OPT)
|
||||||
|
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||||
|
@ -902,6 +936,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
CCOMMON_OPT += -DDYNAMIC_ARCH
|
CCOMMON_OPT += -DDYNAMIC_ARCH
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
CCOMMON_OPT += -DDYNAMIC_OLDER
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(NO_LAPACK), 1)
|
ifeq ($(NO_LAPACK), 1)
|
||||||
CCOMMON_OPT += -DNO_LAPACK
|
CCOMMON_OPT += -DNO_LAPACK
|
||||||
#Disable LAPACK C interface
|
#Disable LAPACK C interface
|
||||||
|
@ -924,6 +962,10 @@ ifeq ($(NO_AVX2), 1)
|
||||||
CCOMMON_OPT += -DNO_AVX2
|
CCOMMON_OPT += -DNO_AVX2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(NO_AVX512), 1)
|
||||||
|
CCOMMON_OPT += -DNO_AVX512
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
CCOMMON_OPT += -DSMP_SERVER
|
CCOMMON_OPT += -DSMP_SERVER
|
||||||
|
|
||||||
|
@ -1230,6 +1272,7 @@ export MSA_FLAGS
|
||||||
export KERNELDIR
|
export KERNELDIR
|
||||||
export FUNCTION_PROFILE
|
export FUNCTION_PROFILE
|
||||||
export TARGET_CORE
|
export TARGET_CORE
|
||||||
|
export NO_AVX512
|
||||||
|
|
||||||
export SGEMM_UNROLL_M
|
export SGEMM_UNROLL_M
|
||||||
export SGEMM_UNROLL_N
|
export SGEMM_UNROLL_N
|
||||||
|
|
|
@ -8,6 +8,13 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), SKYLAKEX)
|
||||||
|
ifndef NO_AVX512
|
||||||
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
ARFLAGS = -m x64
|
ARFLAGS = -m x64
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -20,6 +20,7 @@ DUNNINGTON
|
||||||
NEHALEM
|
NEHALEM
|
||||||
SANDYBRIDGE
|
SANDYBRIDGE
|
||||||
HASWELL
|
HASWELL
|
||||||
|
SKYLAKEX
|
||||||
ATOM
|
ATOM
|
||||||
|
|
||||||
b)AMD CPU:
|
b)AMD CPU:
|
||||||
|
|
16
c_check
16
c_check
|
@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||||
$binformat = bin32;
|
$binformat = bin32;
|
||||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||||
|
|
||||||
|
$no_avx512= 0;
|
||||||
|
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||||
|
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||||
|
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
|
||||||
|
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
|
||||||
|
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||||
|
system(@cmd) == 0;
|
||||||
|
if ($? != 0) {
|
||||||
|
$no_avx512 = 1;
|
||||||
|
} else {
|
||||||
|
$no_avx512 = 0;
|
||||||
|
}
|
||||||
|
unlink("tmpf.o");
|
||||||
|
}
|
||||||
|
|
||||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||||
|
|
||||||
$data =~ /globl\s([_\.]*)(.*)/;
|
$data =~ /globl\s([_\.]*)(.*)/;
|
||||||
|
@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||||
|
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||||
|
|
||||||
$os =~ tr/[a-z]/[A-Z]/;
|
$os =~ tr/[a-z]/[A-Z]/;
|
||||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||||
|
|
5
cblas.h
5
cblas.h
|
@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
||||||
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
# OpenBLASConfig.cmake
|
||||||
|
# --------------------
|
||||||
|
#
|
||||||
|
# OpenBLAS cmake module.
|
||||||
|
# This module sets the following variables in your project::
|
||||||
|
#
|
||||||
|
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
|
||||||
|
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
|
||||||
|
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
|
||||||
|
# OpenBLAS_INCLUDE_DIR - same as DIRS
|
||||||
|
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
|
||||||
|
# OpenBLAS_LIBRARY - same as LIBRARIES
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Available components::
|
||||||
|
#
|
||||||
|
## shared - search for only shared library
|
||||||
|
## static - search for only static library
|
||||||
|
# serial - search for unthreaded library
|
||||||
|
# pthread - search for native pthread threaded library
|
||||||
|
# openmp - search for OpenMP threaded library
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Exported targets::
|
||||||
|
#
|
||||||
|
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
|
||||||
|
## target. Target is shared _or_ static, so, for both, use separate, not
|
||||||
|
## overlapping, installations. ::
|
||||||
|
#
|
||||||
|
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Suggested usage::
|
||||||
|
#
|
||||||
|
# find_package(OpenBLAS)
|
||||||
|
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# The following variables can be set to guide the search for this package::
|
||||||
|
#
|
||||||
|
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
|
||||||
|
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
|
||||||
|
# PATH - environment variable, set to bin directory of this package
|
||||||
|
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
|
||||||
|
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
|
||||||
|
|
||||||
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
|
set(PN OpenBLAS)
|
||||||
|
|
||||||
|
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
|
||||||
|
if(@USE_OPENMP@)
|
||||||
|
set(${PN}_openmp_FOUND 1)
|
||||||
|
elseif(@USE_THREAD@)
|
||||||
|
set(${PN}_pthread_FOUND 1)
|
||||||
|
else()
|
||||||
|
set(${PN}_serial_FOUND 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
check_required_components(${PN})
|
||||||
|
|
||||||
|
#-----------------------------------------------------------------------------
|
||||||
|
# Don't include targets if this file is being picked up by another
|
||||||
|
# project which has already built this as a subproject
|
||||||
|
#-----------------------------------------------------------------------------
|
||||||
|
if(NOT TARGET ${PN}::OpenBLAS)
|
||||||
|
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
|
||||||
|
|
||||||
|
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
|
||||||
|
set(${PN}_LIBRARY ${_loc})
|
||||||
|
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
|
||||||
|
set(${PN}_LIBRARIES ${_ill})
|
||||||
|
|
||||||
|
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
|
||||||
|
set(${PN}_INCLUDE_DIR ${_id})
|
||||||
|
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
|
||||||
|
set(${PN}_INCLUDE_DIRS ${_iid})
|
||||||
|
endif()
|
||||||
|
|
|
@ -49,13 +49,27 @@ if (DYNAMIC_ARCH)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (X86_64)
|
if (X86_64)
|
||||||
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
set(DYNAMIC_CORE PRESCOTT CORE2)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
|
||||||
|
endif ()
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
|
||||||
|
endif ()
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
|
||||||
|
endif ()
|
||||||
if (NOT NO_AVX)
|
if (NOT NO_AVX)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
|
||||||
endif ()
|
endif ()
|
||||||
if (NOT NO_AVX2)
|
if (NOT NO_AVX2)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||||
endif ()
|
endif ()
|
||||||
|
if (NOT NO_AVX512)
|
||||||
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT DYNAMIC_CORE)
|
if (NOT DYNAMIC_CORE)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||||
|
|
||||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||||
Name: OpenBLAS
|
Name: OpenBLAS
|
||||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||||
Version: @OPENBLAS_VERSION@
|
Version: @OPENBLAS_VERSION@
|
||||||
|
|
|
@ -33,7 +33,7 @@ endif ()
|
||||||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||||
set(NO_AVX 1)
|
set(NO_AVX 1)
|
||||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
|
||||||
set(TARGET "NEHALEM")
|
set(TARGET "NEHALEM")
|
||||||
endif ()
|
endif ()
|
||||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||||
|
@ -163,6 +163,9 @@ endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NO_LAPACK)
|
if (NO_LAPACK)
|
||||||
|
|
|
@ -66,3 +66,12 @@ else()
|
||||||
set(BINARY32 1)
|
set(BINARY32 1)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (X86_64 OR X86)
|
||||||
|
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||||
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
|
||||||
|
if (NO_AVX512 EQUAL 1)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||||
|
endif()
|
||||||
|
file(REMOVE "avx512.tmp" "avx512.o")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
12
common.h
12
common.h
|
@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
|
||||||
void gotoblas_profile_quit(void);
|
void gotoblas_profile_quit(void);
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
|
|
||||||
#ifndef C_MSVC
|
#ifndef C_MSVC
|
||||||
int omp_in_parallel(void);
|
int omp_in_parallel(void);
|
||||||
int omp_get_num_procs(void);
|
int omp_get_num_procs(void);
|
||||||
|
@ -649,12 +650,21 @@ int omp_get_num_procs(void);
|
||||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (__STDC_VERSION__ >= 201112L)
|
#if (__STDC_VERSION__ >= 201112L)
|
||||||
|
#if defined(C_GCC) && ( __GNUC__ < 7)
|
||||||
|
// workaround for GCC bug 65467
|
||||||
#ifndef _Atomic
|
#ifndef _Atomic
|
||||||
#define _Atomic volatile
|
#define _Atomic volatile
|
||||||
#endif
|
#endif
|
||||||
#include <stdatomic.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
#include <stdatomic.h>
|
||||||
|
#else
|
||||||
|
#ifndef _Atomic
|
||||||
|
#define _Atomic volatile
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#ifdef __ELF__
|
#ifdef __ELF__
|
||||||
int omp_in_parallel (void) __attribute__ ((weak));
|
int omp_in_parallel (void) __attribute__ ((weak));
|
||||||
|
|
|
@ -51,10 +51,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||||
/* do not restore all register */ \
|
/* do not restore all register */ \
|
||||||
volatile int stack_alloc_size = SIZE; \
|
volatile int stack_alloc_size = SIZE; \
|
||||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
|
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
|
||||||
stack_alloc_size = 0; \
|
|
||||||
STACK_ALLOC_PROTECT_SET \
|
STACK_ALLOC_PROTECT_SET \
|
||||||
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
|
/* Avoid declaring an array of length 0 */ \
|
||||||
|
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
|
||||||
|
__attribute__((aligned(0x20))); \
|
||||||
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||||
#else
|
#else
|
||||||
//Original OpenBLAS/GotoBLAS codes.
|
//Original OpenBLAS/GotoBLAS codes.
|
||||||
|
|
|
@ -60,8 +60,13 @@
|
||||||
#endif
|
#endif
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define MB
|
#ifdef __GNUC__
|
||||||
#define WMB
|
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#else
|
||||||
|
#define MB do {} while (0)
|
||||||
|
#define WMB do {} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
|
|
3
cpuid.h
3
cpuid.h
|
@ -115,6 +115,7 @@
|
||||||
#define CORE_STEAMROLLER 25
|
#define CORE_STEAMROLLER 25
|
||||||
#define CORE_EXCAVATOR 26
|
#define CORE_EXCAVATOR 26
|
||||||
#define CORE_ZEN 27
|
#define CORE_ZEN 27
|
||||||
|
#define CORE_SKYLAKEX 28
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
#define HAVE_SSE2 (1 << 1)
|
#define HAVE_SSE2 (1 << 1)
|
||||||
|
@ -137,6 +138,7 @@
|
||||||
#define HAVE_AVX (1 << 18)
|
#define HAVE_AVX (1 << 18)
|
||||||
#define HAVE_FMA4 (1 << 19)
|
#define HAVE_FMA4 (1 << 19)
|
||||||
#define HAVE_FMA3 (1 << 20)
|
#define HAVE_FMA3 (1 << 20)
|
||||||
|
#define HAVE_AVX512VL (1 << 21)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
@ -211,5 +213,6 @@ typedef struct {
|
||||||
#define CPUTYPE_STEAMROLLER 49
|
#define CPUTYPE_STEAMROLLER 49
|
||||||
#define CPUTYPE_EXCAVATOR 50
|
#define CPUTYPE_EXCAVATOR 50
|
||||||
#define CPUTYPE_ZEN 51
|
#define CPUTYPE_ZEN 51
|
||||||
|
#define CPUTYPE_SKYLAKEX 52
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
49
cpuid_x86.c
49
cpuid_x86.c
|
@ -50,6 +50,8 @@
|
||||||
#ifdef NO_AVX
|
#ifdef NO_AVX
|
||||||
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||||
#define CORE_HASWELL CORE_NEHALEM
|
#define CORE_HASWELL CORE_NEHALEM
|
||||||
|
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
|
||||||
|
#define CORE_SKYLAKEX CORE_NEHALEM
|
||||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||||
|
@ -1299,6 +1301,19 @@ int get_cpuname(void){
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 5:
|
case 5:
|
||||||
|
// Skylake X
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
#endif
|
||||||
case 14:
|
case 14:
|
||||||
// Skylake
|
// Skylake
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
|
@ -1324,6 +1339,23 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 6:
|
||||||
|
switch (model) {
|
||||||
|
case 6: // Cannon Lake
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
|
@ -1556,6 +1588,7 @@ static char *cpuname[] = {
|
||||||
"STEAMROLLER",
|
"STEAMROLLER",
|
||||||
"EXCAVATOR",
|
"EXCAVATOR",
|
||||||
"ZEN",
|
"ZEN",
|
||||||
|
"SKYLAKEX"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1610,6 +1643,7 @@ static char *lowercpuname[] = {
|
||||||
"steamroller",
|
"steamroller",
|
||||||
"excavator",
|
"excavator",
|
||||||
"zen",
|
"zen",
|
||||||
|
"skylakex"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1641,6 +1675,7 @@ static char *corename[] = {
|
||||||
"STEAMROLLER",
|
"STEAMROLLER",
|
||||||
"EXCAVATOR",
|
"EXCAVATOR",
|
||||||
"ZEN",
|
"ZEN",
|
||||||
|
"SKYLAKEX"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1672,6 +1707,7 @@ static char *corename_lower[] = {
|
||||||
"steamroller",
|
"steamroller",
|
||||||
"excavator",
|
"excavator",
|
||||||
"zen",
|
"zen",
|
||||||
|
"skylakex"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1860,6 +1896,19 @@ int get_coretype(void){
|
||||||
else
|
else
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
case 5:
|
case 5:
|
||||||
|
// Skylake X
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CORE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
#endif
|
||||||
case 14:
|
case 14:
|
||||||
// Skylake
|
// Skylake
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
|
|
|
@ -102,7 +102,13 @@ clean ::
|
||||||
rm -f x*
|
rm -f x*
|
||||||
|
|
||||||
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
||||||
CEXTRALIB =
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
CEXTRALIB = -lomp
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
# Single real
|
# Single real
|
||||||
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
||||||
|
|
|
@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
|
|
@ -91,11 +91,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
#if __STDC_VERSION__ >= 201112L
|
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
volatile
|
||||||
#endif
|
|
||||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
|
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
|
||||||
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
|
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
|
||||||
|
|
||||||
/* Make sure if no one is using workspace */
|
|
||||||
START_RPCC();
|
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
|
||||||
STOP_RPCC(waiting1);
|
|
||||||
|
|
||||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||||
|
|
||||||
/* Fused operation to copy region of B into workspace and apply kernel */
|
/* Fused operation to copy region of B into workspace and apply kernel */
|
||||||
|
@ -391,11 +381,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
|
||||||
|
/* Make sure if no one is using workspace */
|
||||||
|
START_RPCC();
|
||||||
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
|
STOP_RPCC(waiting1);
|
||||||
/* Set flag so other threads can access local region of B */
|
/* Set flag so other threads can access local region of B */
|
||||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
|
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||||
WMB;
|
WMB;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Get regions of B from other threads and apply kernel */
|
/* Get regions of B from other threads and apply kernel */
|
||||||
current = mypos;
|
current = mypos;
|
||||||
|
@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Wait until other region of B is initialized */
|
/* Wait until other region of B is initialized */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
/* Apply kernel with local region of A and part of other region of B */
|
/* Apply kernel with local region of A and part of other region of B */
|
||||||
|
@ -430,7 +425,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Clear synchronization flag if this thread is done with other region of B */
|
/* Clear synchronization flag if this thread is done with other region of B */
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -472,7 +468,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Clear synchronization flag if this thread is done with region of B */
|
/* Clear synchronization flag if this thread is done with region of B */
|
||||||
if (is + min_i >= m_to) {
|
if (is + min_i >= m_to) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
WMB;
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
STOP_RPCC(waiting3);
|
STOP_RPCC(waiting3);
|
||||||
|
@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Clear synchronization flags */
|
/* Clear synchronization flags */
|
||||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
for (i = 0; i < nthreads; i++) {
|
||||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
for (j = 0; j < nthreads; j++) {
|
||||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,6 +48,10 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#ifndef OMP_SCHED
|
||||||
|
#define OMP_SCHED static
|
||||||
|
#endif
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||||
|
@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(OMP_SCHED)
|
||||||
for (i = 0; i < num; i ++) {
|
for (i = 0; i < num; i ++) {
|
||||||
|
|
||||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||||
|
|
|
@ -49,6 +49,167 @@
|
||||||
#define EXTERN
|
#define EXTERN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef DYNAMIC_LIST
|
||||||
|
extern gotoblas_t gotoblas_PRESCOTT;
|
||||||
|
|
||||||
|
#ifdef DYN_ATHLON
|
||||||
|
extern gotoblas_t gotoblas_ATHLON;
|
||||||
|
#else
|
||||||
|
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_KATMAI
|
||||||
|
extern gotoblas_t gotoblas_KATMAI;
|
||||||
|
#else
|
||||||
|
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_BANIAS
|
||||||
|
extern gotoblas_t gotoblas_BANIAS;
|
||||||
|
#else
|
||||||
|
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_COPPERMINE
|
||||||
|
extern gotoblas_t gotoblas_COPPERMINE;
|
||||||
|
#else
|
||||||
|
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_NORTHWOOD
|
||||||
|
extern gotoblas_t gotoblas_NORTHWOOD;
|
||||||
|
#else
|
||||||
|
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_CORE2
|
||||||
|
extern gotoblas_t gotoblas_CORE2;
|
||||||
|
#else
|
||||||
|
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_NEHALEM
|
||||||
|
extern gotoblas_t gotoblas_NEHALEM;
|
||||||
|
#else
|
||||||
|
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_BARCELONA
|
||||||
|
extern gotoblas_t gotoblas_BARCELONA;
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_ATOM
|
||||||
|
extern gotoblas_t gotoblas_ATOM;
|
||||||
|
elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_ATOM gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_NANO
|
||||||
|
extern gotoblas_t gotoblas_NANO;
|
||||||
|
#else
|
||||||
|
#define gotoblas_NANO gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_PENRYN
|
||||||
|
extern gotoblas_t gotoblas_PENRYN;
|
||||||
|
#else
|
||||||
|
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_DUNNINGTON
|
||||||
|
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||||
|
#else
|
||||||
|
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_OPTERON
|
||||||
|
extern gotoblas_t gotoblas_OPTERON;
|
||||||
|
#else
|
||||||
|
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_OPTERON_SSE3
|
||||||
|
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||||
|
#else
|
||||||
|
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_BOBCAT
|
||||||
|
extern gotoblas_t gotoblas_BOBCAT;
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_SANDYBRIDGE
|
||||||
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_BULLDOZER
|
||||||
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_PILEDRIVER
|
||||||
|
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_STEAMROLLER
|
||||||
|
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_EXCAVATOR
|
||||||
|
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_HASWELL
|
||||||
|
extern gotoblas_t gotoblas_HASWELL;
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_ZEN
|
||||||
|
extern gotoblas_t gotoblas_ZEN;
|
||||||
|
#elif defined(DYN_HASWELL)
|
||||||
|
#define gotoblas_ZEN gotoblas_HASWELL
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_ZEN gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_ZEN gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
#ifdef DYN_SKYLAKEX
|
||||||
|
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||||
|
#elif defined(DYN_HASWELL)
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#else // not DYNAMIC_LIST
|
||||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
EXTERN gotoblas_t gotoblas_KATMAI;
|
||||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
||||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
||||||
|
@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS;
|
||||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
EXTERN gotoblas_t gotoblas_ATHLON;
|
||||||
|
|
||||||
extern gotoblas_t gotoblas_PRESCOTT;
|
extern gotoblas_t gotoblas_PRESCOTT;
|
||||||
|
extern gotoblas_t gotoblas_CORE2;
|
||||||
|
extern gotoblas_t gotoblas_NEHALEM;
|
||||||
|
extern gotoblas_t gotoblas_BARCELONA;
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
extern gotoblas_t gotoblas_ATOM;
|
extern gotoblas_t gotoblas_ATOM;
|
||||||
extern gotoblas_t gotoblas_NANO;
|
extern gotoblas_t gotoblas_NANO;
|
||||||
extern gotoblas_t gotoblas_CORE2;
|
|
||||||
extern gotoblas_t gotoblas_PENRYN;
|
extern gotoblas_t gotoblas_PENRYN;
|
||||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||||
extern gotoblas_t gotoblas_NEHALEM;
|
|
||||||
extern gotoblas_t gotoblas_OPTERON;
|
extern gotoblas_t gotoblas_OPTERON;
|
||||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||||
extern gotoblas_t gotoblas_BARCELONA;
|
|
||||||
extern gotoblas_t gotoblas_BOBCAT;
|
extern gotoblas_t gotoblas_BOBCAT;
|
||||||
|
#else
|
||||||
|
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||||
|
#define gotoblas_NANO gotoblas_NEHALEM
|
||||||
|
#define gotoblas_PENRYN gotoblas_CORE2
|
||||||
|
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
||||||
|
#define gotoblas_OPTERON gotoblas_CORE2
|
||||||
|
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
||||||
|
#define gotoblas_BOBCAT gotoblas_CORE2
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
|
@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
|
||||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||||
#ifdef NO_AVX2
|
#ifdef NO_AVX2
|
||||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||||
#else
|
#else
|
||||||
extern gotoblas_t gotoblas_HASWELL;
|
extern gotoblas_t gotoblas_HASWELL;
|
||||||
extern gotoblas_t gotoblas_ZEN;
|
extern gotoblas_t gotoblas_ZEN;
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
//Use NEHALEM kernels for sandy bridge
|
//Use NEHALEM kernels for sandy bridge
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||||
|
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||||
|
@ -90,6 +269,7 @@ extern gotoblas_t gotoblas_ZEN;
|
||||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif // DYNAMIC_LIST
|
||||||
|
|
||||||
#define VENDOR_INTEL 1
|
#define VENDOR_INTEL 1
|
||||||
#define VENDOR_AMD 2
|
#define VENDOR_AMD 2
|
||||||
|
@ -284,8 +464,21 @@ static gotoblas_t *get_coretype(void){
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (model == 5) {
|
||||||
|
// Intel Skylake X
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return &gotoblas_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
else {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||||
|
return &gotoblas_NEHALEM;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
//Intel Skylake
|
//Intel Skylake
|
||||||
if (model == 14 || model == 5) {
|
if (model == 14) {
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_HASWELL;
|
return &gotoblas_HASWELL;
|
||||||
else{
|
else{
|
||||||
|
@ -307,6 +500,23 @@ static gotoblas_t *get_coretype(void){
|
||||||
return &gotoblas_NEHALEM;
|
return &gotoblas_NEHALEM;
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
|
case 6:
|
||||||
|
if (model == 6) {
|
||||||
|
// Cannon Lake
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return &gotoblas_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
#else
|
||||||
|
return &gotblas_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return &gotoblas_NEHALEM;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
if (model == 14 ) { // Kaby Lake
|
if (model == 14 ) { // Kaby Lake
|
||||||
|
@ -445,7 +655,8 @@ static char *corename[] = {
|
||||||
"Haswell",
|
"Haswell",
|
||||||
"Steamroller",
|
"Steamroller",
|
||||||
"Excavator",
|
"Excavator",
|
||||||
"Zen"
|
"Zen",
|
||||||
|
"SkylakeX"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -473,7 +684,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||||
|
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -485,7 +696,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
char message[128];
|
char message[128];
|
||||||
//char mname[20];
|
//char mname[20];
|
||||||
|
|
||||||
for ( i=1 ; i <= 23; i++)
|
for ( i=1 ; i <= 24; i++)
|
||||||
{
|
{
|
||||||
if (!strncasecmp(coretype,corename[i],20))
|
if (!strncasecmp(coretype,corename[i],20))
|
||||||
{
|
{
|
||||||
|
@ -503,6 +714,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
|
|
||||||
switch (found)
|
switch (found)
|
||||||
{
|
{
|
||||||
|
case 24: return (&gotoblas_SKYLAKEX);
|
||||||
case 23: return (&gotoblas_ZEN);
|
case 23: return (&gotoblas_ZEN);
|
||||||
case 22: return (&gotoblas_EXCAVATOR);
|
case 22: return (&gotoblas_EXCAVATOR);
|
||||||
case 21: return (&gotoblas_STEAMROLLER);
|
case 21: return (&gotoblas_STEAMROLLER);
|
||||||
|
|
|
@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define FIXED_PAGESIZE 4096
|
#define FIXED_PAGESIZE 4096
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef BUFFERS_PER_THREAD
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||||
|
#else
|
||||||
|
#define BUFFERS_PER_THREAD NUM_BUFFERS
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||||
|
|
||||||
#if defined(_MSC_VER) && !defined(__clang__)
|
#if defined(_MSC_VER) && !defined(__clang__)
|
||||||
|
@ -180,7 +188,7 @@ int get_num_procs(void) {
|
||||||
cpu_set_t *cpusetp;
|
cpu_set_t *cpusetp;
|
||||||
size_t size;
|
size_t size;
|
||||||
int ret;
|
int ret;
|
||||||
// int i,n;
|
int i,n;
|
||||||
|
|
||||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||||
#if !defined(OS_LINUX)
|
#if !defined(OS_LINUX)
|
||||||
|
@ -318,6 +326,8 @@ int goto_get_num_procs (void) {
|
||||||
return blas_cpu_number;
|
return blas_cpu_number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void blas_memory_init();
|
||||||
|
|
||||||
void openblas_fork_handler()
|
void openblas_fork_handler()
|
||||||
{
|
{
|
||||||
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
|
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
|
||||||
|
@ -329,7 +339,7 @@ void openblas_fork_handler()
|
||||||
// implementation of OpenMP.
|
// implementation of OpenMP.
|
||||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||||
int err;
|
int err;
|
||||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
|
||||||
if(err != 0)
|
if(err != 0)
|
||||||
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
|
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
|
||||||
#endif
|
#endif
|
||||||
|
@ -407,16 +417,104 @@ int openblas_get_num_threads(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
struct release_t {
|
|
||||||
void *address;
|
|
||||||
void (*func)(struct release_t *);
|
|
||||||
long attr;
|
|
||||||
};
|
|
||||||
|
|
||||||
int hugetlb_allocated = 0;
|
int hugetlb_allocated = 0;
|
||||||
|
|
||||||
static struct release_t release_info[NUM_BUFFERS];
|
#if defined(OS_WINDOWS)
|
||||||
static int release_pos = 0;
|
#define THREAD_LOCAL __declspec(thread)
|
||||||
|
#define LIKELY_ONE(x) (x)
|
||||||
|
#else
|
||||||
|
#define THREAD_LOCAL __thread
|
||||||
|
#define LIKELY_ONE(x) (__builtin_expect(x, 1))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Stores information about the allocation and how to release it */
|
||||||
|
struct alloc_t {
|
||||||
|
/* Whether this allocation is being used */
|
||||||
|
int used;
|
||||||
|
/* Any special attributes needed when releasing this allocation */
|
||||||
|
int attr;
|
||||||
|
/* Function that can properly release this memory */
|
||||||
|
void (*release_func)(struct alloc_t *);
|
||||||
|
/* Pad to 64-byte alignment */
|
||||||
|
char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Convenience macros for storing release funcs */
|
||||||
|
#define STORE_RELEASE_FUNC(address, func) \
|
||||||
|
if (address != (void *)-1) { \
|
||||||
|
struct alloc_t *alloc_info = (struct alloc_t *)address; \
|
||||||
|
alloc_info->release_func = func; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
|
||||||
|
if (address != (void *)-1) { \
|
||||||
|
struct alloc_t *alloc_info = (struct alloc_t *)address; \
|
||||||
|
alloc_info->release_func = func; \
|
||||||
|
alloc_info->attr = attr; \
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The number of bytes that will be allocated for each buffer. When allocating
|
||||||
|
memory, we store an alloc_t followed by the actual buffer memory. This means
|
||||||
|
that each allocation always has its associated alloc_t, without the need
|
||||||
|
for an auxiliary tracking structure. */
|
||||||
|
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
|
||||||
|
|
||||||
|
/* Clang supports TLS from version 2.8 */
|
||||||
|
#if defined(__clang__) && __clang_major__ > 2 || \
|
||||||
|
(__clang_minor__ == 2 || __clang_minor__ == 8)
|
||||||
|
#define HAS_COMPILER_TLS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* GCC supports TLS from version 4.1 */
|
||||||
|
#if !defined(__clang__) && defined(__GNUC__) && \
|
||||||
|
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
|
||||||
|
#define HAS_COMPILER_TLS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* MSVC supports TLS from version 2005 */
|
||||||
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
||||||
|
#define HAS_COMPILER_TLS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Versions of XCode before 8 did not properly support TLS */
|
||||||
|
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
|
||||||
|
#undef HAS_COMPILER_TLS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Android NDK's before version 12b did not support TLS */
|
||||||
|
#if defined(__ANDROID__) && defined(__clang__)
|
||||||
|
#if __has_include(<android/ndk-version.h>)
|
||||||
|
#include <android/ndk-version.h>
|
||||||
|
#endif
|
||||||
|
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
|
||||||
|
defined(__NDK_MINOR__) && \
|
||||||
|
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
|
||||||
|
#undef HAS_COMPILER_TLS
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Holds pointers to allocated memory */
|
||||||
|
#if defined(SMP) && !defined(USE_OPENMP)
|
||||||
|
/* This is the number of threads than can be spawned by the server, which is the
|
||||||
|
server plus the number of threads in the thread pool */
|
||||||
|
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
|
||||||
|
static int next_memory_table_pos = 0;
|
||||||
|
# if defined(HAS_COMPILER_TLS)
|
||||||
|
/* Use compiler generated thread-local-storage */
|
||||||
|
static int THREAD_LOCAL local_memory_table_pos = 0;
|
||||||
|
# else
|
||||||
|
/* Use system-dependent thread-local-storage */
|
||||||
|
# if defined(OS_WINDOWS)
|
||||||
|
static DWORD local_storage_key;
|
||||||
|
# else
|
||||||
|
static pthread_key_t local_storage_key;
|
||||||
|
# endif /* defined(OS_WINDOWS) */
|
||||||
|
# endif /* defined(HAS_COMPILER_TLS) */
|
||||||
|
#else
|
||||||
|
/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
|
||||||
|
# define MAX_ALLOCATING_THREADS 1
|
||||||
|
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||||
|
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
|
||||||
|
|
||||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||||
static int hot_alloc = 0;
|
static int hot_alloc = 0;
|
||||||
|
@ -432,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0;
|
||||||
static BLASULONG alloc_lock = 0UL;
|
static BLASULONG alloc_lock = 0UL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Returns a pointer to the start of the per-thread memory allocation data */
|
||||||
|
static __inline struct alloc_t ** get_memory_table() {
|
||||||
|
#if defined(SMP) && !defined(USE_OPENMP)
|
||||||
|
# if !defined(HAS_COMPILER_TLS)
|
||||||
|
# if defined(OS_WINDOWS)
|
||||||
|
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
|
||||||
|
# else
|
||||||
|
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
|
||||||
|
# endif /* defined(OS_WINDOWS) */
|
||||||
|
# endif /* !defined(HAS_COMPILER_TLS) */
|
||||||
|
if (!local_memory_table_pos) {
|
||||||
|
LOCK_COMMAND(&alloc_lock);
|
||||||
|
local_memory_table_pos = next_memory_table_pos++;
|
||||||
|
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
|
||||||
|
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
|
||||||
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
|
# if !defined(HAS_COMPILER_TLS)
|
||||||
|
# if defined(OS_WINDOWS)
|
||||||
|
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
|
||||||
|
# else
|
||||||
|
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
|
||||||
|
# endif /* defined(OS_WINDOWS) */
|
||||||
|
# endif /* !defined(HAS_COMPILER_TLS) */
|
||||||
|
}
|
||||||
|
return local_memory_table[local_memory_table_pos];
|
||||||
|
#else
|
||||||
|
return local_memory_table[0];
|
||||||
|
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef ALLOC_MMAP
|
#ifdef ALLOC_MMAP
|
||||||
|
|
||||||
static void alloc_mmap_free(struct release_t *release){
|
static void alloc_mmap_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
if (munmap(alloc_info, allocation_block_size)) {
|
||||||
printf("OpenBLAS : munmap failed\n");
|
printf("OpenBLAS : munmap failed\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -450,28 +578,18 @@ static void *alloc_mmap(void *address){
|
||||||
|
|
||||||
if (address){
|
if (address){
|
||||||
map_address = mmap(address,
|
map_address = mmap(address,
|
||||||
BUFFER_SIZE,
|
allocation_block_size,
|
||||||
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
||||||
} else {
|
} else {
|
||||||
map_address = mmap(address,
|
map_address = mmap(address,
|
||||||
BUFFER_SIZE,
|
allocation_block_size,
|
||||||
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_mmap_free;
|
|
||||||
release_pos ++;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
|
@ -524,25 +642,25 @@ static void *alloc_mmap(void *address){
|
||||||
|
|
||||||
if (address){
|
if (address){
|
||||||
/* Just give up use advanced operation */
|
/* Just give up use advanced operation */
|
||||||
map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||||
if (hot_alloc == 0) {
|
if (hot_alloc == 0) {
|
||||||
map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
|
map_address = mmap(NULL, allocation_block_size * SCALING,
|
||||||
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
if (map_address != (void *)-1) {
|
||||||
|
@ -550,7 +668,7 @@ static void *alloc_mmap(void *address){
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
int ret=0;
|
int ret=0;
|
||||||
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
if(ret==-1){
|
if(ret==-1){
|
||||||
int errsv=errno;
|
int errsv=errno;
|
||||||
perror("OpenBLAS alloc_mmap:");
|
perror("OpenBLAS alloc_mmap:");
|
||||||
|
@ -558,7 +676,7 @@ static void *alloc_mmap(void *address){
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -566,7 +684,7 @@ static void *alloc_mmap(void *address){
|
||||||
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
|
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
|
||||||
|
|
||||||
start = (BLASULONG)map_address;
|
start = (BLASULONG)map_address;
|
||||||
current = (SCALING - 1) * BUFFER_SIZE;
|
current = (SCALING - 1) * allocation_block_size;
|
||||||
|
|
||||||
while(current > 0) {
|
while(current > 0) {
|
||||||
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
|
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
|
||||||
|
@ -581,7 +699,7 @@ static void *alloc_mmap(void *address){
|
||||||
best = (BLASULONG)-1;
|
best = (BLASULONG)-1;
|
||||||
best_address = map_address;
|
best_address = map_address;
|
||||||
|
|
||||||
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
|
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
|
||||||
|
|
||||||
current = run_bench(start, allocsize);
|
current = run_bench(start, allocsize);
|
||||||
|
|
||||||
|
@ -597,7 +715,7 @@ static void *alloc_mmap(void *address){
|
||||||
if ((BLASULONG)best_address > (BLASULONG)map_address)
|
if ((BLASULONG)best_address > (BLASULONG)map_address)
|
||||||
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
|
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
|
||||||
|
|
||||||
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
|
munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
|
||||||
|
|
||||||
map_address = best_address;
|
map_address = best_address;
|
||||||
|
|
||||||
|
@ -610,17 +728,7 @@ static void *alloc_mmap(void *address){
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_mmap_free;
|
|
||||||
release_pos ++;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
}
|
}
|
||||||
|
@ -632,9 +740,9 @@ static void *alloc_mmap(void *address){
|
||||||
|
|
||||||
#ifdef ALLOC_MALLOC
|
#ifdef ALLOC_MALLOC
|
||||||
|
|
||||||
static void alloc_malloc_free(struct release_t *release){
|
static void alloc_malloc_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
free(release -> address);
|
free(alloc_info);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -642,15 +750,11 @@ static void *alloc_malloc(void *address){
|
||||||
|
|
||||||
void *map_address;
|
void *map_address;
|
||||||
|
|
||||||
map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
|
map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
|
||||||
|
|
||||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_malloc_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
|
|
||||||
|
@ -667,24 +771,20 @@ void *qfree (void *address);
|
||||||
#define QCOMMS 0x2
|
#define QCOMMS 0x2
|
||||||
#define QFAST 0x4
|
#define QFAST 0x4
|
||||||
|
|
||||||
static void alloc_qalloc_free(struct release_t *release){
|
static void alloc_qalloc_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
qfree(release -> address);
|
qfree(alloc_info);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void *alloc_qalloc(void *address){
|
static void *alloc_qalloc(void *address){
|
||||||
void *map_address;
|
void *map_address;
|
||||||
|
|
||||||
map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
|
map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
|
||||||
|
|
||||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_qalloc_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
|
return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
|
||||||
}
|
}
|
||||||
|
@ -693,9 +793,9 @@ static void *alloc_qalloc(void *address){
|
||||||
|
|
||||||
#ifdef ALLOC_WINDOWS
|
#ifdef ALLOC_WINDOWS
|
||||||
|
|
||||||
static void alloc_windows_free(struct release_t *release){
|
static void alloc_windows_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
|
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -703,17 +803,13 @@ static void *alloc_windows(void *address){
|
||||||
void *map_address;
|
void *map_address;
|
||||||
|
|
||||||
map_address = VirtualAlloc(address,
|
map_address = VirtualAlloc(address,
|
||||||
BUFFER_SIZE,
|
allocation_block_size,
|
||||||
MEM_RESERVE | MEM_COMMIT,
|
MEM_RESERVE | MEM_COMMIT,
|
||||||
PAGE_READWRITE);
|
PAGE_READWRITE);
|
||||||
|
|
||||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC(map_address, alloc_windows_free);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_windows_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
}
|
}
|
||||||
|
@ -725,13 +821,14 @@ static void *alloc_windows(void *address){
|
||||||
#define DEVICEDRIVER_NAME "/dev/mapper"
|
#define DEVICEDRIVER_NAME "/dev/mapper"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void alloc_devicedirver_free(struct release_t *release){
|
static void alloc_devicedirver_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
int attr = alloc_info -> attr;
|
||||||
|
if (munmap(address, allocation_block_size)) {
|
||||||
printf("OpenBLAS : Bugphysarea unmap failed.\n");
|
printf("OpenBLAS : Bugphysarea unmap failed.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (close(release -> attr)) {
|
if (close(attr)) {
|
||||||
printf("OpenBLAS : Bugphysarea close failed.\n");
|
printf("OpenBLAS : Bugphysarea close failed.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -748,17 +845,12 @@ static void *alloc_devicedirver(void *address){
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
map_address = mmap(address, BUFFER_SIZE,
|
map_address = mmap(address, allocation_block_size,
|
||||||
PROT_READ | PROT_WRITE,
|
PROT_READ | PROT_WRITE,
|
||||||
MAP_FILE | MAP_SHARED,
|
MAP_FILE | MAP_SHARED,
|
||||||
fd, 0);
|
fd, 0);
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].attr = fd;
|
|
||||||
release_info[release_pos].func = alloc_devicedirver_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
}
|
}
|
||||||
|
@ -767,9 +859,9 @@ static void *alloc_devicedirver(void *address){
|
||||||
|
|
||||||
#ifdef ALLOC_SHM
|
#ifdef ALLOC_SHM
|
||||||
|
|
||||||
static void alloc_shm_free(struct release_t *release){
|
static void alloc_shm_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
if (shmdt(release -> address)) {
|
if (shmdt(alloc_info)) {
|
||||||
printf("OpenBLAS : Shared memory unmap failed.\n");
|
printf("OpenBLAS : Shared memory unmap failed.\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -778,22 +870,21 @@ static void *alloc_shm(void *address){
|
||||||
void *map_address;
|
void *map_address;
|
||||||
int shmid;
|
int shmid;
|
||||||
|
|
||||||
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
|
shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
|
||||||
|
|
||||||
map_address = (void *)shmat(shmid, address, 0);
|
map_address = (void *)shmat(shmid, address, 0);
|
||||||
|
|
||||||
if (map_address != (void *)-1){
|
if (map_address != (void *)-1){
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
shmctl(shmid, IPC_RMID, 0);
|
shmctl(shmid, IPC_RMID, 0);
|
||||||
|
|
||||||
release_info[release_pos].address = map_address;
|
struct alloc_t *alloc_info = (struct alloc_t *)map_address;
|
||||||
release_info[release_pos].attr = shmid;
|
alloc_info->release_func = alloc_shm_free;
|
||||||
release_info[release_pos].func = alloc_shm_free;
|
alloc_info->attr = shmid;
|
||||||
release_pos ++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
|
@ -801,23 +892,23 @@ static void *alloc_shm(void *address){
|
||||||
|
|
||||||
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
|
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
|
||||||
|
|
||||||
static void alloc_hugetlb_free(struct release_t *release){
|
static void alloc_hugetlb_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_AIX)
|
#if defined(OS_LINUX) || defined(OS_AIX)
|
||||||
if (shmdt(release -> address)) {
|
if (shmdt(alloc_info)) {
|
||||||
printf("OpenBLAS : Hugepage unmap failed.\n");
|
printf("OpenBLAS : Hugepage unmap failed.\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __sun__
|
#ifdef __sun__
|
||||||
|
|
||||||
munmap(release -> address, BUFFER_SIZE);
|
munmap(alloc_info, allocation_block_size);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
|
|
||||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -830,7 +921,7 @@ static void *alloc_hugetlb(void *address){
|
||||||
#if defined(OS_LINUX) || defined(OS_AIX)
|
#if defined(OS_LINUX) || defined(OS_AIX)
|
||||||
int shmid;
|
int shmid;
|
||||||
|
|
||||||
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
|
shmid = shmget(IPC_PRIVATE, allocation_block_size,
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
SHM_HUGETLB |
|
SHM_HUGETLB |
|
||||||
#endif
|
#endif
|
||||||
|
@ -843,7 +934,7 @@ static void *alloc_hugetlb(void *address){
|
||||||
map_address = (void *)shmat(shmid, address, SHM_RND);
|
map_address = (void *)shmat(shmid, address, SHM_RND);
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (map_address != (void *)-1){
|
if (map_address != (void *)-1){
|
||||||
|
@ -860,7 +951,7 @@ static void *alloc_hugetlb(void *address){
|
||||||
mha.mha_pagesize = HUGE_PAGESIZE;
|
mha.mha_pagesize = HUGE_PAGESIZE;
|
||||||
memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
|
memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
|
||||||
|
|
||||||
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
|
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
|
@ -884,7 +975,7 @@ static void *alloc_hugetlb(void *address){
|
||||||
}
|
}
|
||||||
|
|
||||||
map_address = (void *)VirtualAlloc(address,
|
map_address = (void *)VirtualAlloc(address,
|
||||||
BUFFER_SIZE,
|
allocation_block_size,
|
||||||
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
|
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
|
||||||
PAGE_READWRITE);
|
PAGE_READWRITE);
|
||||||
|
|
||||||
|
@ -895,11 +986,7 @@ static void *alloc_hugetlb(void *address){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (map_address != (void *)-1){
|
STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].func = alloc_hugetlb_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
}
|
}
|
||||||
|
@ -911,13 +998,14 @@ static void *alloc_hugetlb(void *address){
|
||||||
|
|
||||||
static int hugetlb_pid = 0;
|
static int hugetlb_pid = 0;
|
||||||
|
|
||||||
static void alloc_hugetlbfile_free(struct release_t *release){
|
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
|
||||||
|
|
||||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
int attr = alloc_info -> attr;
|
||||||
|
if (munmap(alloc_info, allocation_block_size)) {
|
||||||
printf("OpenBLAS : HugeTLBfs unmap failed.\n");
|
printf("OpenBLAS : HugeTLBfs unmap failed.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (close(release -> attr)) {
|
if (close(attr)) {
|
||||||
printf("OpenBLAS : HugeTLBfs close failed.\n");
|
printf("OpenBLAS : HugeTLBfs close failed.\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -938,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){
|
||||||
|
|
||||||
unlink(filename);
|
unlink(filename);
|
||||||
|
|
||||||
map_address = mmap(address, BUFFER_SIZE,
|
map_address = mmap(address, allocation_block_size,
|
||||||
PROT_READ | PROT_WRITE,
|
PROT_READ | PROT_WRITE,
|
||||||
MAP_SHARED,
|
MAP_SHARED,
|
||||||
fd, 0);
|
fd, 0);
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
|
||||||
release_info[release_pos].address = map_address;
|
|
||||||
release_info[release_pos].attr = fd;
|
|
||||||
release_info[release_pos].func = alloc_hugetlbfile_free;
|
|
||||||
release_pos ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return map_address;
|
return map_address;
|
||||||
}
|
}
|
||||||
|
@ -961,35 +1044,35 @@ static BLASULONG base_address = 0UL;
|
||||||
static BLASULONG base_address = BASE_ADDRESS;
|
static BLASULONG base_address = BASE_ADDRESS;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static volatile struct {
|
#if __STDC_VERSION__ >= 201112L
|
||||||
BLASULONG lock;
|
static _Atomic int memory_initialized = 0;
|
||||||
void *addr;
|
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
int pos;
|
|
||||||
#endif
|
|
||||||
int used;
|
|
||||||
#ifndef __64BIT__
|
|
||||||
char dummy[48];
|
|
||||||
#else
|
#else
|
||||||
char dummy[40];
|
static volatile int memory_initialized = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
} memory[NUM_BUFFERS];
|
|
||||||
|
|
||||||
static int memory_initialized = 0;
|
|
||||||
|
|
||||||
/* Memory allocation routine */
|
/* Memory allocation routine */
|
||||||
/* procpos ... indicates where it comes from */
|
/* procpos ... indicates where it comes from */
|
||||||
/* 0 : Level 3 functions */
|
/* 0 : Level 3 functions */
|
||||||
/* 1 : Level 2 functions */
|
/* 1 : Level 2 functions */
|
||||||
/* 2 : Thread */
|
/* 2 : Thread */
|
||||||
|
|
||||||
|
static void blas_memory_init(){
|
||||||
|
#if defined(SMP) && !defined(USE_OPENMP)
|
||||||
|
next_memory_table_pos = 0;
|
||||||
|
# if !defined(HAS_COMPILER_TLS)
|
||||||
|
# if defined(OS_WINDOWS)
|
||||||
|
local_storage_key = ::TlsAlloc();
|
||||||
|
# else
|
||||||
|
pthread_key_create(&local_storage_key, NULL);
|
||||||
|
# endif /* defined(OS_WINDOWS) */
|
||||||
|
# endif /* defined(HAS_COMPILER_TLS) */
|
||||||
|
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||||
|
memset(local_memory_table, 0, sizeof(local_memory_table));
|
||||||
|
}
|
||||||
|
|
||||||
void *blas_memory_alloc(int procpos){
|
void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
int position;
|
int position;
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
int mypos;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void *map_address;
|
void *map_address;
|
||||||
|
|
||||||
|
@ -1019,24 +1102,17 @@ void *blas_memory_alloc(int procpos){
|
||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
void *(**func)(void *address);
|
void *(**func)(void *address);
|
||||||
|
struct alloc_t * alloc_info;
|
||||||
|
struct alloc_t ** alloc_table;
|
||||||
|
|
||||||
#if defined(USE_OPENMP)
|
if (!LIKELY_ONE(memory_initialized)) {
|
||||||
if (!memory_initialized) {
|
#if defined(SMP) && !defined(USE_OPENMP)
|
||||||
#endif
|
/* Only allow a single thread to initialize memory system */
|
||||||
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
|
|
||||||
if (!memory_initialized) {
|
if (!memory_initialized) {
|
||||||
|
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
for (position = 0; position < NUM_BUFFERS; position ++){
|
|
||||||
memory[position].addr = (void *)0;
|
|
||||||
memory[position].pos = -1;
|
|
||||||
memory[position].used = 0;
|
|
||||||
memory[position].lock = 0;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
blas_memory_init();
|
||||||
#ifdef DYNAMIC_ARCH
|
#ifdef DYNAMIC_ARCH
|
||||||
gotoblas_dynamic_init();
|
gotoblas_dynamic_init();
|
||||||
#endif
|
#endif
|
||||||
|
@ -1057,65 +1133,23 @@ void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
memory_initialized = 1;
|
memory_initialized = 1;
|
||||||
|
|
||||||
|
#if defined(SMP) && !defined(USE_OPENMP)
|
||||||
}
|
}
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#if defined(USE_OPENMP)
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("Alloc Start ...\n");
|
printf("Alloc Start ...\n");
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
|
|
||||||
mypos = WhereAmI();
|
|
||||||
|
|
||||||
position = mypos;
|
|
||||||
while (position >= NUM_BUFFERS) position >>= 1;
|
|
||||||
|
|
||||||
do {
|
|
||||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#else
|
|
||||||
blas_lock(&memory[position].lock);
|
|
||||||
#endif
|
|
||||||
if (!memory[position].used) goto allocation;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#else
|
|
||||||
blas_unlock(&memory[position].lock);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
position ++;
|
|
||||||
|
|
||||||
} while (position < NUM_BUFFERS);
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
position = 0;
|
position = 0;
|
||||||
|
alloc_table = get_memory_table();
|
||||||
do {
|
do {
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#else
|
|
||||||
if (!memory[position].used) {
|
|
||||||
blas_lock(&memory[position].lock);
|
|
||||||
#endif
|
|
||||||
if (!memory[position].used) goto allocation;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#else
|
|
||||||
blas_unlock(&memory[position].lock);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
position ++;
|
position ++;
|
||||||
|
|
||||||
} while (position < NUM_BUFFERS);
|
} while (position < BUFFERS_PER_THREAD);
|
||||||
|
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
|
@ -1125,14 +1159,8 @@ void *blas_memory_alloc(int procpos){
|
||||||
printf(" Position -> %d\n", position);
|
printf(" Position -> %d\n", position);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
memory[position].used = 1;
|
alloc_info = alloc_table[position];
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
if (!alloc_info) {
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#else
|
|
||||||
blas_unlock(&memory[position].lock);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!memory[position].addr) {
|
|
||||||
do {
|
do {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("Allocation Start : %lx\n", base_address);
|
printf("Allocation Start : %lx\n", base_address);
|
||||||
|
@ -1148,14 +1176,14 @@ void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
#ifdef ALLOC_DEVICEDRIVER
|
#ifdef ALLOC_DEVICEDRIVER
|
||||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
|
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ALLOC_HUGETLBFILE
|
#ifdef ALLOC_HUGETLBFILE
|
||||||
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
|
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
|
||||||
#ifndef OS_WINDOWS
|
#ifndef OS_WINDOWS
|
||||||
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
|
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -1172,89 +1200,44 @@ void *blas_memory_alloc(int procpos){
|
||||||
#endif
|
#endif
|
||||||
if (((BLASLONG) map_address) == -1) base_address = 0UL;
|
if (((BLASLONG) map_address) == -1) base_address = 0UL;
|
||||||
|
|
||||||
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
|
if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
|
||||||
|
|
||||||
} while ((BLASLONG)map_address == -1);
|
} while ((BLASLONG)map_address == -1);
|
||||||
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
alloc_table[position] = alloc_info = map_address;
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
memory[position].addr = map_address;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
|
|
||||||
if (memory[position].pos == -1) memory[position].pos = mypos;
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DYNAMIC_ARCH
|
|
||||||
|
|
||||||
if (memory_initialized == 1) {
|
|
||||||
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
|
|
||||||
if (memory_initialized == 1) {
|
|
||||||
|
|
||||||
if (!gotoblas) gotoblas_dynamic_init();
|
|
||||||
|
|
||||||
memory_initialized = 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("Mapped : %p %3d\n\n",
|
printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
|
||||||
(void *)memory[position].addr, position);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return (void *)memory[position].addr;
|
alloc_info->used = 1;
|
||||||
|
|
||||||
|
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||||
|
|
||||||
error:
|
error:
|
||||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blas_memory_free(void *free_area){
|
void blas_memory_free(void *buffer){
|
||||||
|
#ifdef DEBUG
|
||||||
int position;
|
int position;
|
||||||
|
struct alloc_t ** alloc_table;
|
||||||
|
#endif
|
||||||
|
/* Since we passed an offset pointer to the caller, get back to the actual allocation */
|
||||||
|
struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("Unmapped Start : %p ...\n", free_area);
|
printf("Unmapped Start : %p ...\n", alloc_info);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
position = 0;
|
alloc_info->used = 0;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
|
||||||
position++;
|
|
||||||
|
|
||||||
if (memory[position].addr != free_area) goto error;
|
|
||||||
|
|
||||||
#ifdef DEBUG
|
|
||||||
printf(" Position : %d\n", position);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// arm: ensure all writes are finished before other thread takes this memory
|
|
||||||
WMB;
|
|
||||||
|
|
||||||
memory[position].used = 0;
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("Unmap Succeeded.\n\n");
|
printf("Unmap Succeeded.\n\n");
|
||||||
|
@ -1262,15 +1245,13 @@ void blas_memory_free(void *free_area){
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
error:
|
|
||||||
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
|
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
for (position = 0; position < NUM_BUFFERS; position++)
|
alloc_table = get_memory_table();
|
||||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
for (position = 0; position < BUFFERS_PER_THREAD; position++){
|
||||||
#endif
|
if (alloc_table[position]) {
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1287,16 +1268,20 @@ void blas_memory_free_nolock(void * map_address) {
|
||||||
|
|
||||||
void blas_shutdown(void){
|
void blas_shutdown(void){
|
||||||
|
|
||||||
int pos;
|
int pos, thread;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
BLASFUNC(blas_thread_shutdown)();
|
BLASFUNC(blas_thread_shutdown)();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
LOCK_COMMAND(&alloc_lock);
|
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
|
||||||
|
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
|
||||||
for (pos = 0; pos < release_pos; pos ++) {
|
struct alloc_t *alloc_info = local_memory_table[thread][pos];
|
||||||
release_info[pos].func(&release_info[pos]);
|
if (alloc_info) {
|
||||||
|
alloc_info->release_func(alloc_info);
|
||||||
|
alloc_info = (void *)0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SEEK_ADDRESS
|
#ifdef SEEK_ADDRESS
|
||||||
|
@ -1305,17 +1290,6 @@ void blas_shutdown(void){
|
||||||
base_address = BASE_ADDRESS;
|
base_address = BASE_ADDRESS;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (pos = 0; pos < NUM_BUFFERS; pos ++){
|
|
||||||
memory[pos].addr = (void *)0;
|
|
||||||
memory[pos].used = 0;
|
|
||||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
|
||||||
memory[pos].pos = -1;
|
|
||||||
#endif
|
|
||||||
memory[pos].lock = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1339,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
size_t size;
|
size_t size;
|
||||||
BLASULONG buffer;
|
BLASULONG buffer;
|
||||||
|
|
||||||
size = BUFFER_SIZE - PAGESIZE;
|
size = allocation_block_size - PAGESIZE;
|
||||||
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
||||||
|
|
||||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||||
|
@ -1360,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
UNLOCK_COMMAND(&init_lock);
|
UNLOCK_COMMAND(&init_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
|
size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
|
||||||
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
||||||
|
|
||||||
while (size > 0) {
|
while (size > 0) {
|
||||||
|
|
|
@ -167,7 +167,7 @@ int get_L2_size(void){
|
||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||||
|
|
||||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||||
|
|
||||||
|
@ -251,7 +251,7 @@ int get_L2_size(void){
|
||||||
void blas_set_parameter(void){
|
void blas_set_parameter(void){
|
||||||
|
|
||||||
int factor;
|
int factor;
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||||
int size = 16;
|
int size = 16;
|
||||||
#else
|
#else
|
||||||
int size = get_L2_size();
|
int size = get_L2_size();
|
||||||
|
|
|
@ -128,6 +128,8 @@ so : ../$(LIBSONAME)
|
||||||
|
|
||||||
ifeq ($(OSNAME), Android)
|
ifeq ($(OSNAME), Android)
|
||||||
INTERNALNAME = $(LIBPREFIX).so
|
INTERNALNAME = $(LIBPREFIX).so
|
||||||
|
FEXTRALIB += -lm
|
||||||
|
EXTRALIB += -lm
|
||||||
else
|
else
|
||||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
|
|
17
getarch.c
17
getarch.c
|
@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "HASWELL"
|
#define CORENAME "HASWELL"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_SKYLAKEX
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "SKYLAKEX"
|
||||||
|
#define ARCHCONFIG "-DSKYLAKEX " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||||
|
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
|
||||||
|
#define LIBNAME "skylakex"
|
||||||
|
#define CORENAME "SKYLAKEX"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_ATOM
|
#ifdef FORCE_ATOM
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){
|
||||||
#elif NO_PARALLEL_MAKE==1
|
#elif NO_PARALLEL_MAKE==1
|
||||||
printf("MAKE += -j 1\n");
|
printf("MAKE += -j 1\n");
|
||||||
#else
|
#else
|
||||||
#ifndef OS_WINDOWS
|
|
||||||
printf("MAKE += -j %d\n", get_num_cores());
|
printf("MAKE += -j %d\n", get_num_cores());
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
|
||||||
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
|
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
|
||||||
|
|
||||||
CSBLAS1OBJS = \
|
CSBLAS1OBJS = \
|
||||||
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
||||||
|
@ -277,7 +277,7 @@ CSBLAS3OBJS = \
|
||||||
cblas_sgeadd.$(SUFFIX)
|
cblas_sgeadd.$(SUFFIX)
|
||||||
|
|
||||||
CDBLAS1OBJS = \
|
CDBLAS1OBJS = \
|
||||||
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
||||||
|
@ -294,7 +294,7 @@ CDBLAS3OBJS += \
|
||||||
cblas_dgeadd.$(SUFFIX)
|
cblas_dgeadd.$(SUFFIX)
|
||||||
|
|
||||||
CCBLAS1OBJS = \
|
CCBLAS1OBJS = \
|
||||||
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||||
cblas_ccopy.$(SUFFIX) \
|
cblas_ccopy.$(SUFFIX) \
|
||||||
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
|
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
|
||||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||||
|
@ -320,7 +320,7 @@ CCBLAS3OBJS = \
|
||||||
|
|
||||||
|
|
||||||
CZBLAS1OBJS = \
|
CZBLAS1OBJS = \
|
||||||
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||||
cblas_zcopy.$(SUFFIX) \
|
cblas_zcopy.$(SUFFIX) \
|
||||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||||
|
@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
|
||||||
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
|
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
|
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||||
if (incy < 0) y -= (n - 1) * incy;
|
if (incy < 0) y -= (n - 1) * incy;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
//
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
//Temporarily work-around the low performance issue with small imput size &
|
//Temporarily work-around the low performance issue with small imput size &
|
||||||
//multithreads.
|
//multithreads.
|
||||||
if (n <= MULTI_THREAD_MINIMAL)
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -44,6 +44,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
#define SMP_THRESHOLD_MIN 65536.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "QGEMM "
|
#define ERROR_NAME "QGEMM "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
|
@ -52,6 +53,7 @@
|
||||||
#define ERROR_NAME "SGEMM "
|
#define ERROR_NAME "SGEMM "
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
|
#define SMP_THRESHOLD_MIN 8192.0
|
||||||
#ifndef GEMM3M
|
#ifndef GEMM3M
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "XGEMM "
|
#define ERROR_NAME "XGEMM "
|
||||||
|
@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads_max;
|
|
||||||
int nthreads_avail;
|
|
||||||
double MNK;
|
double MNK;
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
XFLOAT *sa, *sb;
|
XFLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads_max;
|
|
||||||
int nthreads_avail;
|
|
||||||
double MNK;
|
double MNK;
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||||
|
|
||||||
nthreads_max = num_cpu_avail(3);
|
|
||||||
nthreads_avail = nthreads_max;
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
nthreads_max = 1;
|
args.nthreads = 1;
|
||||||
#else
|
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
|
||||||
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
|
||||||
nthreads_max = 1;
|
|
||||||
#endif
|
|
||||||
args.common = NULL;
|
|
||||||
|
|
||||||
if ( nthreads_max > nthreads_avail )
|
|
||||||
args.nthreads = nthreads_avail;
|
|
||||||
else
|
else
|
||||||
args.nthreads = nthreads_max;
|
args.nthreads = num_cpu_avail(3);
|
||||||
|
args.common = NULL;
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
|
||||||
|
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 1048576 )
|
if (n <= 1048576 )
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||||
|
|
||||||
args.nthreads = num_cpu_avail(3);
|
|
||||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
else
|
else
|
||||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
|
else
|
||||||
|
args.nthreads = num_cpu_avail(3);
|
||||||
|
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
|
|
|
@ -41,7 +41,11 @@
|
||||||
#ifdef FUNCTION_PROFILE
|
#ifdef FUNCTION_PROFILE
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(Z13)
|
||||||
|
#define MULTI_THREAD_MINIMAL 200000
|
||||||
|
#else
|
||||||
|
#define MULTI_THREAD_MINIMAL 10000
|
||||||
|
#endif
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
||||||
|
@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_NAME;
|
||||||
#else
|
#else
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
#endif
|
#endif
|
||||||
|
@ -86,12 +90,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
//
|
||||||
|
//Temporarily work-around the low performance issue with small imput size &
|
||||||
|
//multithreads.
|
||||||
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if ( n <= 1048576 )
|
if ( n <= 1048576 )
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
if (incx == 0 || incy == 0)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
# Makefile.L3
|
# Makefile.L3
|
||||||
set(USE_TRMM false)
|
set(USE_TRMM false)
|
||||||
|
|
||||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
|
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
|
||||||
set(USE_TRMM true)
|
set(USE_TRMM true)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), SKYLAKEX)
|
||||||
|
USE_TRMM = 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), ZEN)
|
ifeq ($(CORE), ZEN)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble cdot_kernel_L999
|
ble cdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #0
|
# cmp INC_X, #0
|
||||||
beq cdot_kernel_L999
|
# beq cdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
# cmp INC_Y, #0
|
||||||
beq cdot_kernel_L999
|
# beq cdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne cdot_kernel_S_BEGIN
|
bne cdot_kernel_S_BEGIN
|
||||||
|
|
|
@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble ddot_kernel_L999
|
ble ddot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #0
|
# cmp INC_X, #0
|
||||||
beq ddot_kernel_L999
|
# beq ddot_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
# cmp INC_Y, #0
|
||||||
beq ddot_kernel_L999
|
# beq ddot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne ddot_kernel_S_BEGIN
|
bne ddot_kernel_S_BEGIN
|
||||||
|
|
|
@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble sdot_kernel_L999
|
ble sdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #0
|
# cmp INC_X, #0
|
||||||
beq sdot_kernel_L999
|
# beq sdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
# cmp INC_Y, #0
|
||||||
beq sdot_kernel_L999
|
# beq sdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne sdot_kernel_S_BEGIN
|
bne sdot_kernel_S_BEGIN
|
||||||
|
|
|
@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble zdot_kernel_L999
|
ble zdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #0
|
# cmp INC_X, #0
|
||||||
beq zdot_kernel_L999
|
# beq zdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
# cmp INC_Y, #0
|
||||||
beq zdot_kernel_L999
|
# beq zdot_kernel_L999
|
||||||
|
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne zdot_kernel_S_BEGIN
|
bne zdot_kernel_S_BEGIN
|
||||||
|
|
|
@ -233,14 +233,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = casum_compute(n, x, inc_x);
|
asum = casum_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -183,14 +183,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
if (n <= 0) return 0;
|
if (n <= 0) return 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
do_copy(n, x, inc_x, y, inc_y);
|
do_copy(n, x, inc_x, y, inc_y);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -228,14 +228,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = dasum_compute(n, x, inc_x);
|
asum = dasum_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -384,14 +384,11 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
|
||||||
RETURN_TYPE dot = 0.0;
|
RETURN_TYPE dot = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
||||||
|
|
|
@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2 = nrm2_compute(n, x, inc_x);
|
nrm2 = nrm2_compute(n, x, inc_x);
|
||||||
|
|
|
@ -321,14 +321,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
BLASLONG max_index = 0;
|
BLASLONG max_index = 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
max_index = iamax_compute(n, x, inc_x);
|
max_index = iamax_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -330,14 +330,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
BLASLONG max_index = 0;
|
BLASLONG max_index = 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
max_index = izamax_compute(n, x, inc_x);
|
max_index = izamax_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -230,14 +230,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = sasum_compute(n, x, inc_x);
|
asum = sasum_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2_double = nrm2_compute(n, x, inc_x);
|
nrm2_double = nrm2_compute(n, x, inc_x);
|
||||||
|
|
|
@ -230,14 +230,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = zasum_compute(n, x, inc_x);
|
asum = zasum_compute(n, x, inc_x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -317,14 +317,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
CIMAG(zdot) = 0.0;
|
CIMAG(zdot) = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
|
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c
|
||||||
#
|
#
|
||||||
SROTKERNEL = srot.c
|
SROTKERNEL = srot.c
|
||||||
DROTKERNEL = drot.c
|
DROTKERNEL = drot.c
|
||||||
#CROTKERNEL = ../arm/zrot.c
|
CROTKERNEL = zrot.c
|
||||||
ZROTKERNEL = zrot.c
|
ZROTKERNEL = zrot.c
|
||||||
#
|
#
|
||||||
SSCALKERNEL = sscal.c
|
SSCALKERNEL = sscal.c
|
||||||
|
|
|
@ -647,7 +647,9 @@ static int get_l2_size_old(void){
|
||||||
return 6144;
|
return 6144;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
// return 0;
|
||||||
|
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
|
||||||
|
return 256;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){
|
||||||
l2 = BITMASK(ecx, 16, 0xffff);
|
l2 = BITMASK(ecx, 16, 0xffff);
|
||||||
|
|
||||||
#ifndef ARCH_X86
|
#ifndef ARCH_X86
|
||||||
|
if (l2 <= 0) {
|
||||||
|
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
|
||||||
|
return 256;
|
||||||
|
}
|
||||||
return l2;
|
return l2;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -871,6 +877,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "SkylakeX\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef OPTERON
|
#ifdef OPTERON
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1 @@
|
||||||
include $(KERNELDIR)/KERNEL.PENRYN
|
include $(KERNELDIR)/KERNEL.PENRYN
|
||||||
SSWAPKERNEL = ../arm/swap.c
|
|
||||||
DSWAPKERNEL = ../arm/swap.c
|
|
||||||
|
|
|
@ -138,6 +138,14 @@
|
||||||
/* INCX != 1 or INCY != 1 */
|
/* INCX != 1 or INCY != 1 */
|
||||||
|
|
||||||
.L14:
|
.L14:
|
||||||
|
cmpl $0, %ebx
|
||||||
|
jne .L141
|
||||||
|
cmpl $0, %ecx
|
||||||
|
jne .L141
|
||||||
|
/* INCX == 0 and INCY == 0 */
|
||||||
|
jmp .L27
|
||||||
|
|
||||||
|
.L141:
|
||||||
movl %edx, %eax
|
movl %edx, %eax
|
||||||
sarl $2, %eax
|
sarl $2, %eax
|
||||||
jle .L28
|
jle .L28
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -63,7 +63,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -63,7 +63,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.HASWELL
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
|
||||||
|
|
||||||
|
|
||||||
|
#DTRMMKERNEL = ../generic/trmmkernel_16x2.c
|
||||||
|
#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
|
||||||
|
#DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
#DGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
#DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
|
SGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
DGEMM_BETA = ../generic/gemm_beta.c
|
|
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "caxpy_microk_steamroller-2.c"
|
#include "caxpy_microk_steamroller-2.c"
|
||||||
#elif defined(BULLDOZER)
|
#elif defined(BULLDOZER)
|
||||||
#include "caxpy_microk_bulldozer-2.c"
|
#include "caxpy_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX)
|
||||||
#include "caxpy_microk_haswell-2.c"
|
#include "caxpy_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "caxpy_microk_sandy-2.c"
|
#include "caxpy_microk_sandy-2.c"
|
||||||
|
|
|
@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "cdot_microk_bulldozer-2.c"
|
#include "cdot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
||||||
#include "cdot_microk_steamroller-2.c"
|
#include "cdot_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "cdot_microk_haswell-2.c"
|
#include "cdot_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "cdot_microk_sandy-2.c"
|
#include "cdot_microk_sandy-2.c"
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN)
|
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "cgemv_n_microk_haswell-4.c"
|
#include "cgemv_n_microk_haswell-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "cgemv_n_microk_bulldozer-4.c"
|
#include "cgemv_n_microk_bulldozer-4.c"
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN)
|
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "cgemv_t_microk_haswell-4.c"
|
#include "cgemv_t_microk_haswell-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "cgemv_t_microk_bulldozer-4.c"
|
#include "cgemv_t_microk_bulldozer-4.c"
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN)
|
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "cscal_microk_haswell-2.c"
|
#include "cscal_microk_haswell-2.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||||
#include "cscal_microk_bulldozer-2.c"
|
#include "cscal_microk_bulldozer-2.c"
|
||||||
|
|
|
@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "daxpy_microk_steamroller-2.c"
|
#include "daxpy_microk_steamroller-2.c"
|
||||||
#elif defined(PILEDRIVER)
|
#elif defined(PILEDRIVER)
|
||||||
#include "daxpy_microk_piledriver-2.c"
|
#include "daxpy_microk_piledriver-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "daxpy_microk_haswell-2.c"
|
#include "daxpy_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "daxpy_microk_sandy-2.c"
|
#include "daxpy_microk_sandy-2.c"
|
||||||
|
|
|
@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "ddot_microk_piledriver-2.c"
|
#include "ddot_microk_piledriver-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ddot_microk_nehalem-2.c"
|
#include "ddot_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "ddot_microk_haswell-2.c"
|
#include "ddot_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "ddot_microk_sandy-2.c"
|
#include "ddot_microk_sandy-2.c"
|
||||||
|
@ -169,14 +169,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
FLOAT dot = 0.0;
|
FLOAT dot = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||||
} else {
|
} else {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "dgemv_n_microk_nehalem-4.c"
|
#include "dgemv_n_microk_nehalem-4.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
|
||||||
#include "dgemv_n_microk_haswell-4.c"
|
#include "dgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
|
||||||
#include "dgemv_t_microk_haswell-4.c"
|
#include "dgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "dscal_microk_bulldozer-2.c"
|
#include "dscal_microk_bulldozer-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dscal_microk_sandy-2.c"
|
#include "dscal_microk_sandy-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "dscal_microk_haswell-2.c"
|
#include "dscal_microk_haswell-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dsymv_L_microk_bulldozer-2.c"
|
#include "dsymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "dsymv_L_microk_haswell-2.c"
|
#include "dsymv_L_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dsymv_L_microk_sandy-2.c"
|
#include "dsymv_L_microk_sandy-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dsymv_U_microk_bulldozer-2.c"
|
#include "dsymv_U_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "dsymv_U_microk_haswell-2.c"
|
#include "dsymv_U_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dsymv_U_microk_sandy-2.c"
|
#include "dsymv_U_microk_sandy-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "saxpy_microk_nehalem-2.c"
|
#include "saxpy_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "saxpy_microk_haswell-2.c"
|
#include "saxpy_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "saxpy_microk_sandy-2.c"
|
#include "saxpy_microk_sandy-2.c"
|
||||||
|
|
|
@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sdot_microk_steamroller-2.c"
|
#include "sdot_microk_steamroller-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sdot_microk_nehalem-2.c"
|
#include "sdot_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "sdot_microk_haswell-2.c"
|
#include "sdot_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "sdot_microk_sandy-2.c"
|
#include "sdot_microk_sandy-2.c"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sgemv_n_microk_nehalem-4.c"
|
#include "sgemv_n_microk_nehalem-4.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "sgemv_n_microk_sandy-4.c"
|
#include "sgemv_n_microk_sandy-4.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "sgemv_n_microk_haswell-4.c"
|
#include "sgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sgemv_t_microk_bulldozer-4.c"
|
#include "sgemv_t_microk_bulldozer-4.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "sgemv_t_microk_sandy-4.c"
|
#include "sgemv_t_microk_sandy-4.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "sgemv_t_microk_haswell-4.c"
|
#include "sgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "ssymv_L_microk_bulldozer-2.c"
|
#include "ssymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ssymv_L_microk_nehalem-2.c"
|
#include "ssymv_L_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "ssymv_L_microk_haswell-2.c"
|
#include "ssymv_L_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "ssymv_L_microk_sandy-2.c"
|
#include "ssymv_L_microk_sandy-2.c"
|
||||||
|
|
|
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "ssymv_U_microk_bulldozer-2.c"
|
#include "ssymv_U_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ssymv_U_microk_nehalem-2.c"
|
#include "ssymv_U_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "ssymv_U_microk_haswell-2.c"
|
#include "ssymv_U_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "ssymv_U_microk_sandy-2.c"
|
#include "ssymv_U_microk_sandy-2.c"
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
|
|
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "zaxpy_microk_bulldozer-2.c"
|
#include "zaxpy_microk_bulldozer-2.c"
|
||||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zaxpy_microk_steamroller-2.c"
|
#include "zaxpy_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "zaxpy_microk_haswell-2.c"
|
#include "zaxpy_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "zaxpy_microk_sandy-2.c"
|
#include "zaxpy_microk_sandy-2.c"
|
||||||
|
|
|
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "zdot_microk_bulldozer-2.c"
|
#include "zdot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
||||||
#include "zdot_microk_steamroller-2.c"
|
#include "zdot_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "zdot_microk_haswell-2.c"
|
#include "zdot_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "zdot_microk_sandy-2.c"
|
#include "zdot_microk_sandy-2.c"
|
||||||
|
|
|
@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN)
|
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "zgemv_n_microk_haswell-4.c"
|
#include "zgemv_n_microk_haswell-4.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "zgemv_n_microk_sandy-4.c"
|
#include "zgemv_n_microk_sandy-4.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zgemv_t_microk_bulldozer-4.c"
|
#include "zgemv_t_microk_bulldozer-4.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "zgemv_t_microk_haswell-4.c"
|
#include "zgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(ZEN)
|
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "zscal_microk_haswell-2.c"
|
#include "zscal_microk_haswell-2.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||||
#include "zscal_microk_bulldozer-2.c"
|
#include "zscal_microk_bulldozer-2.c"
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
|
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue