Merge pull request #1656 from xianyi/develop

Update the 0.3 branch from develop
This commit is contained in:
Martin Kroeker 2018-07-01 11:55:21 +02:00 committed by GitHub
commit 3a8f0a6a1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
162 changed files with 13355 additions and 708 deletions

View File

@ -6,12 +6,15 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM) project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 0.dev) set(OpenBLAS_PATCH_VERSION 1.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions # Adhere to GNU filesystem layout conventions
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(OpenBLAS_LIBNAME openblas) set(OpenBLAS_LIBNAME openblas)
####### #######
@ -20,6 +23,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON
endif() endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
####### #######
if(BUILD_WITHOUT_LAPACK) if(BUILD_WITHOUT_LAPACK)
@ -208,6 +212,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
# Install libraries # Install libraries
install(TARGETS ${OpenBLAS_LIBNAME} install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLASTargets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
@ -267,3 +272,21 @@ if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif() endif()
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}")
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
VERSION ${${PN}_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake
${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}Targets"
NAMESPACE "${PN}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done done
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
ifeq ($(DYNAMIC_OLDER), 1)
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
endif
endif endif
ifdef USE_THREAD ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
@ -294,9 +297,10 @@ endif
lapack-test : lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1) ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion ) ./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif endif
@ -308,9 +312,9 @@ lapack-runtest:
blas-test: blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
dummy : dummy :

View File

@ -98,7 +98,7 @@ endif
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.0.dev VERSION = 0.3.1.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
# If you want to support multiple architecture in one binary # If you want to support multiple architecture in one binary
# DYNAMIC_ARCH = 1 # DYNAMIC_ARCH = 1
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON,
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
# DYNAMIC_OLDER = 1
# C compiler including binary type(32bit / 64bit). Default is gcc. # C compiler including binary type(32bit / 64bit). Default is gcc.
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
# CC = gcc # CC = gcc
@ -55,6 +60,14 @@ VERSION = 0.3.0.dev
# This flag is always set for POWER8. Don't modify the flag # This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1 # USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be # You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's # less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script. # automatically detected by the the script.
@ -151,8 +164,11 @@ NO_AFFINITY = 1
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading # with single thread. (Actually in recent versions this is a factor proportional to the
# in small matrix sizes. The default value is 4. # number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very # If you need santy check by comparing reference BLAS. It'll be very

View File

@ -62,6 +62,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL) ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE) ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -95,6 +98,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL) ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE) ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -141,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2 GETARCH_FLAGS += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
GETARCH_FLAGS += -DNO_AVX512
endif
ifeq ($(DEBUG), 1) ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g GETARCH_FLAGS += -g
endif endif
@ -238,7 +248,7 @@ endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6 export MACOSX_DEPLOYMENT_TARGET=10.8
endif endif
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
@ -462,13 +472,37 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += PENRYN DUNNINGTON
endif
DYNAMIC_CORE += NEHALEM
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += OPTERON OPTERON_SSE3
endif
DYNAMIC_CORE += BARCELONA
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += BOBCAT ATOM NANO
endif
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif endif
ifneq ($(NO_AVX2), 1) ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL ZEN DYNAMIC_CORE += HASWELL ZEN
endif endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
endif
endif
endif
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@ -902,6 +936,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH CCOMMON_OPT += -DDYNAMIC_ARCH
endif endif
ifeq ($(DYNAMIC_OLDER), 1)
CCOMMON_OPT += -DDYNAMIC_OLDER
endif
ifeq ($(NO_LAPACK), 1) ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface #Disable LAPACK C interface
@ -924,6 +962,10 @@ ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2 CCOMMON_OPT += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifdef SMP ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER
@ -1230,6 +1272,7 @@ export MSA_FLAGS
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE
export NO_AVX512
export SGEMM_UNROLL_M export SGEMM_UNROLL_M
export SGEMM_UNROLL_N export SGEMM_UNROLL_N

View File

@ -8,6 +8,13 @@ endif
endif endif
endif endif
ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
endif
endif
ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64 ARFLAGS = -m x64
endif endif

View File

@ -20,6 +20,7 @@ DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE SANDYBRIDGE
HASWELL HASWELL
SKYLAKEX
ATOM ATOM
b)AMD CPU: b)AMD CPU:

16
c_check
View File

@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32; $binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/); $binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/; $data =~ /globl\s([_\.]*)(.*)/;
@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
$os =~ tr/[a-z]/[A-Z]/; $os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/;

View File

@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@ -0,0 +1,79 @@
# OpenBLASConfig.cmake
# --------------------
#
# OpenBLAS cmake module.
# This module sets the following variables in your project::
#
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
# OpenBLAS_INCLUDE_DIR - same as DIRS
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
# OpenBLAS_LIBRARY - same as LIBRARIES
#
#
# Available components::
#
## shared - search for only shared library
## static - search for only static library
# serial - search for unthreaded library
# pthread - search for native pthread threaded library
# openmp - search for OpenMP threaded library
#
#
# Exported targets::
#
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
## target. Target is shared _or_ static, so, for both, use separate, not
## overlapping, installations. ::
#
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
#
#
# Suggested usage::
#
# find_package(OpenBLAS)
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
#
#
# The following variables can be set to guide the search for this package::
#
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
# PATH - environment variable, set to bin directory of this package
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
@PACKAGE_INIT@
set(PN OpenBLAS)
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
if(@USE_OPENMP@)
set(${PN}_openmp_FOUND 1)
elseif(@USE_THREAD@)
set(${PN}_pthread_FOUND 1)
else()
set(${PN}_serial_FOUND 1)
endif()
check_required_components(${PN})
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
if(NOT TARGET ${PN}::OpenBLAS)
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
set(${PN}_LIBRARY ${_loc})
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
set(${PN}_LIBRARIES ${_ill})
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIR ${_id})
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIRS ${_iid})
endif()

View File

@ -49,13 +49,27 @@ if (DYNAMIC_ARCH)
endif () endif ()
if (X86_64) if (X86_64)
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE PRESCOTT CORE2)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
endif ()
if (NOT NO_AVX) if (NOT NO_AVX)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
endif () endif ()
if (NOT NO_AVX2) if (NOT NO_AVX2)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif () endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
endif () endif ()
if (NOT DYNAMIC_CORE) if (NOT DYNAMIC_CORE)

View File

@ -1,7 +1,7 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@ Version: @OPENBLAS_VERSION@

View File

@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1) set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
set(TARGET "NEHALEM") set(TARGET "NEHALEM")
endif () endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@ -163,6 +163,9 @@ endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
endif () endif ()
if (NO_LAPACK) if (NO_LAPACK)

View File

@ -66,3 +66,12 @@ else()
set(BINARY32 1) set(BINARY32 1)
endif() endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
endif()

View File

@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void); void gotoblas_profile_quit(void);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#ifndef C_MSVC #ifndef C_MSVC
int omp_in_parallel(void); int omp_in_parallel(void);
int omp_get_num_procs(void); int omp_get_num_procs(void);
@ -649,12 +650,21 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif #endif
#if (__STDC_VERSION__ >= 201112L) #if (__STDC_VERSION__ >= 201112L)
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic #ifndef _Atomic
#define _Atomic volatile #define _Atomic volatile
#endif #endif
#include <stdatomic.h>
#endif #endif
#include <stdatomic.h>
#else
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#else #else
#ifdef __ELF__ #ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak)); int omp_in_parallel (void) __attribute__ ((weak));

View File

@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* - large enough to support all architectures and kernel * - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing. * Chosing a too small SIZE will lead to a stack smashing.
*/ */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \ /* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \ /* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \ volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
stack_alloc_size = 0; \ STACK_ALLOC_PROTECT_SET \
STACK_ALLOC_PROTECT_SET \ /* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else #else
//Original OpenBLAS/GotoBLAS codes. //Original OpenBLAS/GotoBLAS codes.

View File

@ -60,8 +60,13 @@
#endif #endif
*/ */
#define MB #ifdef __GNUC__
#define WMB #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){ static void __inline blas_lock(volatile BLASULONG *address){

View File

@ -115,6 +115,7 @@
#define CORE_STEAMROLLER 25 #define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26 #define CORE_EXCAVATOR 26
#define CORE_ZEN 27 #define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -137,6 +138,7 @@
#define HAVE_AVX (1 << 18) #define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19) #define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20) #define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -211,5 +213,6 @@ typedef struct {
#define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51 #define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#endif #endif

View File

@ -50,6 +50,8 @@
#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM #define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
#define CORE_SKYLAKEX CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
@ -1299,6 +1301,19 @@ int get_cpuname(void){
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 5: case 5:
// Skylake X
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx())
@ -1324,6 +1339,23 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 6:
switch (model) {
case 6: // Cannon Lake
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
}
break;
case 9: case 9:
case 8: case 8:
switch (model) { switch (model) {
@ -1556,6 +1588,7 @@ static char *cpuname[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX"
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1610,6 +1643,7 @@ static char *lowercpuname[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex"
}; };
static char *corename[] = { static char *corename[] = {
@ -1641,6 +1675,7 @@ static char *corename[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX"
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1672,6 +1707,7 @@ static char *corename_lower[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex"
}; };
@ -1860,6 +1896,19 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
case 5: case 5:
// Skylake X
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx())

View File

@ -102,7 +102,13 @@ clean ::
rm -f x* rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB = ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
# Single real # Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

View File

@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

View File

@ -91,11 +91,7 @@
#endif #endif
typedef struct { typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING) #if defined(FUSED_GEMM) && !defined(TIMING)
/* Fused operation to copy region of B into workspace and apply kernel */ /* Fused operation to copy region of B into workspace and apply kernel */
@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} }
#endif #endif
/* Set flag so other threads can access local region of B */ for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) /* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB; WMB;
}
} }
/* Get regions of B from other threads and apply kernel */ /* Get regions of B from other threads and apply kernel */
@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */ /* Wait until other region of B is initialized */
START_RPCC(); START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */ /* Apply kernel with local region of A and part of other region of B */
@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */ /* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
} }
} }
} while (current != mypos); } while (current != mypos);
/* Iterate through steps of m /* Iterate through steps of m
* Note: First step has already been finished */ * Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){ for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is; min_i = m_to - is;
@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js); c, ldc, is, js);
STOP_RPCC(kernel); STOP_RPCC(kernel);
#ifdef TIMING #ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif #endif
/* Clear synchronization flag if this thread is done with region of B */ /* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) { if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB; WMB;
} }
} }
@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) { for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
} }
} }
STOP_RPCC(waiting3); STOP_RPCC(waiting3);
@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
} }
/* Clear synchronization flags */ /* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) { for (i = 0; i < nthreads; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) { for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) { for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0; job[i].working[j][CACHE_LINE_SIZE * k] = 0;
} }

View File

@ -48,6 +48,10 @@
#else #else
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break; break;
} }
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef USE_SIMPLE_THREADED_LEVEL3

View File

@ -49,6 +49,167 @@
#define EXTERN #define EXTERN
#endif #endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD; EXTERN gotoblas_t gotoblas_NORTHWOOD;
@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON; EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT; extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO; extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON; extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX #ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_BULLDOZER;
@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR; extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2 #ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else #else
extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN; extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif #endif
#else #else
//Use NEHALEM kernels for sandy bridge //Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA
@ -90,6 +269,7 @@ extern gotoblas_t gotoblas_ZEN;
#define gotoblas_ZEN gotoblas_BARCELONA #define gotoblas_ZEN gotoblas_BARCELONA
#endif #endif
#endif // DYNAMIC_LIST
#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
@ -284,8 +464,21 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
if (model == 5) {
// Intel Skylake X
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
return &gotoblas_HASWELL;
else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
#endif
}
//Intel Skylake //Intel Skylake
if (model == 14 || model == 5) { if (model == 14) {
if(support_avx()) if(support_avx())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ else{
@ -307,6 +500,23 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; return &gotoblas_NEHALEM;
} }
return NULL; return NULL;
case 6:
if (model == 6) {
// Cannon Lake
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return &gotoblas_HASWELL;
#else
return &gotblas_SANDYBRIDGE;
#endif
else
return &gotoblas_NEHALEM;
#endif
}
return NULL;
case 9: case 9:
case 8: case 8:
if (model == 14 ) { // Kaby Lake if (model == 14 ) { // Kaby Lake
@ -445,7 +655,8 @@ static char *corename[] = {
"Haswell", "Haswell",
"Steamroller", "Steamroller",
"Excavator", "Excavator",
"Zen" "Zen",
"SkylakeX"
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -473,7 +684,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0]; return corename[0];
} }
@ -485,7 +696,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128]; char message[128];
//char mname[20]; //char mname[20];
for ( i=1 ; i <= 23; i++) for ( i=1 ; i <= 24; i++)
{ {
if (!strncasecmp(coretype,corename[i],20)) if (!strncasecmp(coretype,corename[i],20))
{ {
@ -503,6 +714,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found) switch (found)
{ {
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN); case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR); case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER); case 21: return (&gotoblas_STEAMROLLER);

File diff suppressed because it is too large Load Diff

View File

@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){ void blas_set_parameter(void){
int factor; int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
int size = 16; int size = 16;
#else #else
int size = get_L2_size(); int size = get_L2_size();

View File

@ -128,6 +128,8 @@ so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android) ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm
EXTRALIB += -lm
else else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif

View File

@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#endif #endif
#ifdef FORCE_SKYLAKEX
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){
#elif NO_PARALLEL_MAKE==1 #elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n"); printf("MAKE += -j 1\n");
#else #else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores()); printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif #endif
break; break;

View File

@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
CSBLAS1OBJS = \ CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
@ -277,7 +277,7 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(SUFFIX)
CDBLAS1OBJS = \ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
@ -294,7 +294,7 @@ CDBLAS3OBJS += \
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(SUFFIX)
CCBLAS1OBJS = \ CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_ccopy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
@ -320,7 +320,7 @@ CCBLAS3OBJS = \
CZBLAS1OBJS = \ CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)

View File

@ -40,11 +40,11 @@
#include "common.h" #include "common.h"
#ifdef FUNCTION_PROFILE #ifdef FUNCTION_PROFILE
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(Z13) #if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000 #define MULTI_THREAD_MINIMAL 200000
#else #else
#define MULTI_THREAD_MINIMAL 10000 #define MULTI_THREAD_MINIMAL 10000
#endif #endif
#ifndef CBLAS #ifndef CBLAS
@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (incy < 0) y -= (n - 1) * incy; if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) //
nthreads = 1;
//Temporarily work-around the low performance issue with small imput size & //Temporarily work-around the low performance issue with small imput size &
//multithreads. //multithreads.
if (n <= MULTI_THREAD_MINIMAL) if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -44,6 +44,7 @@
#endif #endif
#ifndef COMPLEX #ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "QGEMM " #define ERROR_NAME "QGEMM "
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -52,6 +53,7 @@
#define ERROR_NAME "SGEMM " #define ERROR_NAME "SGEMM "
#endif #endif
#else #else
#define SMP_THRESHOLD_MIN 8192.0
#ifndef GEMM3M #ifndef GEMM3M
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "XGEMM " #define ERROR_NAME "XGEMM "
@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *sa, *sb; FLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *sa, *sb; XFLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k; MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1; args.nthreads = 1;
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#endif
args.common = NULL;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else else
args.nthreads = nthreads_max; args.nthreads = num_cpu_avail(3);
args.common = NULL;
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

View File

@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
if (n <= 1048576 ) if (n <= 1048576 )
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT);
args.nthreads = num_cpu_avail(3);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) { if (args.nthreads == 1) {

View File

@ -41,7 +41,11 @@
#ifdef FUNCTION_PROFILE #ifdef FUNCTION_PROFILE
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS #ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#endif #endif
#ifndef CBLAS #ifndef CBLAS
PRINT_DEBUG_CNAME; PRINT_DEBUG_NAME;
#else #else
PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;
#endif #endif
@ -86,12 +90,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if (incy < 0) y -= (n - 1) * incy * 2; if (incy < 0) y -= (n - 1) * incy * 2;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) //
//Temporarily work-around the low performance issue with small imput size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
if ( n <= 1048576 ) if ( n <= 1048576 )
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
if (incy < 0) y -= (n - 1) * incy * 2; if (incy < 0) y -= (n - 1) * incy * 2;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) if (incx == 0 || incy == 0)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3 # Makefile.L3
set(USE_TRMM false) set(USE_TRMM false)
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
set(USE_TRMM true) set(USE_TRMM true)
endif () endif ()

View File

@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1 USE_TRMM = 1
endif endif
ifeq ($(CORE), SKYLAKEX)
USE_TRMM = 1
endif
ifeq ($(CORE), ZEN) ifeq ($(CORE), ZEN)
USE_TRMM = 1 USE_TRMM = 1
endif endif

View File

@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble cdot_kernel_L999 ble cdot_kernel_L999
cmp INC_X, #0 # cmp INC_X, #0
beq cdot_kernel_L999 # beq cdot_kernel_L999
cmp INC_Y, #0 # cmp INC_Y, #0
beq cdot_kernel_L999 # beq cdot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne cdot_kernel_S_BEGIN bne cdot_kernel_S_BEGIN

View File

@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble ddot_kernel_L999 ble ddot_kernel_L999
cmp INC_X, #0 # cmp INC_X, #0
beq ddot_kernel_L999 # beq ddot_kernel_L999
cmp INC_Y, #0 # cmp INC_Y, #0
beq ddot_kernel_L999 # beq ddot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne ddot_kernel_S_BEGIN bne ddot_kernel_S_BEGIN

View File

@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble sdot_kernel_L999 ble sdot_kernel_L999
cmp INC_X, #0 # cmp INC_X, #0
beq sdot_kernel_L999 # beq sdot_kernel_L999
cmp INC_Y, #0 # cmp INC_Y, #0
beq sdot_kernel_L999 # beq sdot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne sdot_kernel_S_BEGIN bne sdot_kernel_S_BEGIN

View File

@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble zdot_kernel_L999 ble zdot_kernel_L999
cmp INC_X, #0 # cmp INC_X, #0
beq zdot_kernel_L999 # beq zdot_kernel_L999
cmp INC_Y, #0 # cmp INC_Y, #0
beq zdot_kernel_L999 # beq zdot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne zdot_kernel_S_BEGIN bne zdot_kernel_S_BEGIN

View File

@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0; FLOAT asum = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
asum = casum_compute(n, x, inc_x); asum = casum_compute(n, x, inc_x);

View File

@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if (n <= 0) return 0; if (n <= 0) return 0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
do_copy(n, x, inc_x, y, inc_y); do_copy(n, x, inc_x, y, inc_y);

View File

@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0; FLOAT asum = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
asum = dasum_compute(n, x, inc_x); asum = dasum_compute(n, x, inc_x);

View File

@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
" faddp "DOTF", v0.2d \n" " faddp "DOTF", v0.2d \n"
#endif /* !defined(DSDOT) */ #endif /* !defined(DSDOT) */
#else /* !defined(DOUBLE) */ #else /* !defined(DOUBLE) */
#define KERNEL_F1 \ #define KERNEL_F1 \
" ldr "TMPX", ["X"] \n" \ " ldr "TMPX", ["X"] \n" \
" ldr "TMPY", ["Y"] \n" \ " ldr "TMPY", ["Y"] \n" \
@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
RETURN_TYPE dot = 0.0; RETURN_TYPE dot = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || inc_y == 0 || n <= 10000)
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y); dot = dot_compute(n, x, inc_x, y, inc_y);

View File

@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x <= 0) return 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1);
if (n <= 10000) if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
nrm2_compute(n, x, inc_x, &ssq, &scale); nrm2_compute(n, x, inc_x, &ssq, &scale);

View File

@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x <= 0) return 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1);
if (n <= 10000) if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
nrm2 = nrm2_compute(n, x, inc_x); nrm2 = nrm2_compute(n, x, inc_x);

View File

@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG max_index = 0; BLASLONG max_index = 0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
max_index = iamax_compute(n, x, inc_x); max_index = iamax_compute(n, x, inc_x);

View File

@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG max_index = 0; BLASLONG max_index = 0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
max_index = izamax_compute(n, x, inc_x); max_index = izamax_compute(n, x, inc_x);

View File

@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0; FLOAT asum = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
asum = sasum_compute(n, x, inc_x); asum = sasum_compute(n, x, inc_x);

View File

@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x <= 0) return 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1);
if (n <= 10000) if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
nrm2_double = nrm2_compute(n, x, inc_x); nrm2_double = nrm2_compute(n, x, inc_x);

View File

@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0; FLOAT asum = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || n <= 10000)
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
asum = zasum_compute(n, x, inc_x); asum = zasum_compute(n, x, inc_x);

View File

@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
CIMAG(zdot) = 0.0; CIMAG(zdot) = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || inc_y == 0 || n <= 10000)
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
zdot_compute(n, x, inc_x, y, inc_y, &zdot); zdot_compute(n, x, inc_x, y, inc_y, &zdot);

View File

@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c
# #
SROTKERNEL = srot.c SROTKERNEL = srot.c
DROTKERNEL = drot.c DROTKERNEL = drot.c
#CROTKERNEL = ../arm/zrot.c CROTKERNEL = zrot.c
ZROTKERNEL = zrot.c ZROTKERNEL = zrot.c
# #
SSCALKERNEL = sscal.c SSCALKERNEL = sscal.c

View File

@ -647,7 +647,9 @@ static int get_l2_size_old(void){
return 6144; return 6144;
} }
} }
return 0; // return 0;
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
return 256;
} }
#endif #endif
@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){
l2 = BITMASK(ecx, 16, 0xffff); l2 = BITMASK(ecx, 16, 0xffff);
#ifndef ARCH_X86 #ifndef ARCH_X86
if (l2 <= 0) {
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
return 256;
}
return l2; return l2;
#else #else
@ -871,6 +877,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
#ifdef SKYLAKEX
#ifdef DEBUG
fprintf(stderr, "SkylakeX\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef OPTERON #ifdef OPTERON

View File

@ -1,3 +1 @@
include $(KERNELDIR)/KERNEL.PENRYN include $(KERNELDIR)/KERNEL.PENRYN
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c

View File

@ -138,6 +138,14 @@
/* INCX != 1 or INCY != 1 */ /* INCX != 1 or INCY != 1 */
.L14: .L14:
cmpl $0, %ebx
jne .L141
cmpl $0, %ecx
jne .L141
/* INCX == 0 and INCY == 0 */
jmp .L27
.L141:
movl %edx, %eax movl %edx, %eax
sarl $2, %eax sarl $2, %eax
jle .L28 jle .L28

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -0,0 +1,19 @@
include $(KERNELDIR)/KERNEL.HASWELL
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
#DTRMMKERNEL = ../generic/trmmkernel_16x2.c
#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
#DGEMMINCOPY = ../generic/gemm_ncopy_16.c
#DGEMMITCOPY = ../generic/gemm_tcopy_16.c
#DGEMMONCOPY = ../generic/gemm_ncopy_2.c
#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "caxpy_microk_steamroller-2.c" #include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER) #elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c" #include "caxpy_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX)
#include "caxpy_microk_haswell-2.c" #include "caxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "caxpy_microk_sandy-2.c" #include "caxpy_microk_sandy-2.c"

View File

@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cdot_microk_bulldozer-2.c" #include "cdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "cdot_microk_steamroller-2.c" #include "cdot_microk_steamroller-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "cdot_microk_haswell-2.c" #include "cdot_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "cdot_microk_sandy-2.c" #include "cdot_microk_sandy-2.c"

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h> #include <stdio.h>
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "cgemv_n_microk_haswell-4.c" #include "cgemv_n_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_n_microk_bulldozer-4.c" #include "cgemv_n_microk_bulldozer-4.c"

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "cgemv_t_microk_haswell-4.c" #include "cgemv_t_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_t_microk_bulldozer-4.c" #include "cgemv_t_microk_bulldozer-4.c"

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "cscal_microk_haswell-2.c" #include "cscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) #elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "cscal_microk_bulldozer-2.c" #include "cscal_microk_bulldozer-2.c"

View File

@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_steamroller-2.c" #include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER) #elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c" #include "daxpy_microk_piledriver-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "daxpy_microk_haswell-2.c" #include "daxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "daxpy_microk_sandy-2.c" #include "daxpy_microk_sandy-2.c"

View File

@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "ddot_microk_bulldozer-2.c" #include "ddot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(EXCAVATOR) #elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ddot_microk_steamroller-2.c" #include "ddot_microk_steamroller-2.c"
#elif defined(PILEDRIVER) #elif defined(PILEDRIVER)
#include "ddot_microk_piledriver-2.c" #include "ddot_microk_piledriver-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c" #include "ddot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "ddot_microk_haswell-2.c" #include "ddot_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "ddot_microk_sandy-2.c" #include "ddot_microk_sandy-2.c"
@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0; FLOAT temp2 = 0.0;
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while(i < n1) while(i < n1)
{ {
@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT dot = 0.0; FLOAT dot = 0.0;
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); if (inc_x == 0 || inc_y == 0 || n <= 10000)
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y); dot = dot_compute(n, x, inc_x, y, inc_y);

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c" #include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
#include "dgemv_n_microk_haswell-4.c" #include "dgemv_n_microk_haswell-4.c"
#endif #endif

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
#include "dgemv_t_microk_haswell-4.c" #include "dgemv_t_microk_haswell-4.c"
#endif #endif

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dscal_microk_bulldozer-2.c" #include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c" #include "dscal_microk_sandy-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "dscal_microk_haswell-2.c" #include "dscal_microk_haswell-2.c"
#endif #endif

View File

@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c" #include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "dsymv_L_microk_haswell-2.c" #include "dsymv_L_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "dsymv_L_microk_sandy-2.c" #include "dsymv_L_microk_sandy-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_U_microk_bulldozer-2.c" #include "dsymv_U_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "dsymv_U_microk_haswell-2.c" #include "dsymv_U_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "dsymv_U_microk_sandy-2.c" #include "dsymv_U_microk_sandy-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c" #include "saxpy_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "saxpy_microk_haswell-2.c" #include "saxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c" #include "saxpy_microk_sandy-2.c"

View File

@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sdot_microk_steamroller-2.c" #include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c" #include "sdot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "sdot_microk_haswell-2.c" #include "sdot_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "sdot_microk_sandy-2.c" #include "sdot_microk_sandy-2.c"

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_n_microk_nehalem-4.c" #include "sgemv_n_microk_nehalem-4.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "sgemv_n_microk_sandy-4.c" #include "sgemv_n_microk_sandy-4.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "sgemv_n_microk_haswell-4.c" #include "sgemv_n_microk_haswell-4.c"
#endif #endif

View File

@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_t_microk_bulldozer-4.c" #include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c" #include "sgemv_t_microk_sandy-4.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "sgemv_t_microk_haswell-4.c" #include "sgemv_t_microk_haswell-4.c"
#endif #endif

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ssymv_L_microk_bulldozer-2.c" #include "ssymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "ssymv_L_microk_nehalem-2.c" #include "ssymv_L_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "ssymv_L_microk_haswell-2.c" #include "ssymv_L_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "ssymv_L_microk_sandy-2.c" #include "ssymv_L_microk_sandy-2.c"

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ssymv_U_microk_bulldozer-2.c" #include "ssymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "ssymv_U_microk_nehalem-2.c" #include "ssymv_U_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "ssymv_U_microk_haswell-2.c" #include "ssymv_U_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "ssymv_U_microk_sandy-2.c" #include "ssymv_U_microk_sandy-2.c"

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zaxpy_microk_bulldozer-2.c" #include "zaxpy_microk_bulldozer-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zaxpy_microk_steamroller-2.c" #include "zaxpy_microk_steamroller-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "zaxpy_microk_haswell-2.c" #include "zaxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "zaxpy_microk_sandy-2.c" #include "zaxpy_microk_sandy-2.c"

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zdot_microk_bulldozer-2.c" #include "zdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "zdot_microk_steamroller-2.c" #include "zdot_microk_steamroller-2.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "zdot_microk_haswell-2.c" #include "zdot_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "zdot_microk_sandy-2.c" #include "zdot_microk_sandy-2.c"

View File

@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "zgemv_n_microk_haswell-4.c" #include "zgemv_n_microk_haswell-4.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "zgemv_n_microk_sandy-4.c" #include "zgemv_n_microk_sandy-4.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_t_microk_bulldozer-4.c" #include "zgemv_t_microk_bulldozer-4.c"
#elif defined(HASWELL) || defined(ZEN) #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "zgemv_t_microk_haswell-4.c" #include "zgemv_t_microk_haswell-4.c"
#endif #endif

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(ZEN) #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "zscal_microk_haswell-2.c" #include "zscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) #elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "zscal_microk_bulldozer-2.c" #include "zscal_microk_bulldozer-2.c"

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)

Some files were not shown because too many files have changed in this diff Show More