Merge branch 'develop' into develop

This commit is contained in:
Abdelrauf 2019-01-16 19:25:13 +04:00 committed by GitHub
commit a034e65512
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
410 changed files with 27279 additions and 3020 deletions

View File

@ -4,10 +4,10 @@ dist: precise
sudo: true sudo: true
language: c language: c
jobs: matrix:
include: include:
- &test-ubuntu - &test-ubuntu
stage: test os: linux
compiler: gcc compiler: gcc
addons: addons:
apt: apt:
@ -57,7 +57,7 @@ jobs:
- TARGET_BOX=LINUX32 - TARGET_BOX=LINUX32
- BTYPE="BINARY=32" - BTYPE="BINARY=32"
- stage: test - os: linux
compiler: gcc compiler: gcc
addons: addons:
apt: apt:
@ -77,13 +77,13 @@ jobs:
# which is slower than container-based infrastructure used for jobs # which is slower than container-based infrastructure used for jobs
# that don't require sudo. # that don't require sudo.
- &test-alpine - &test-alpine
stage: test os: linux
dist: trusty dist: trusty
sudo: true sudo: true
language: minimal language: minimal
before_install: before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install: install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@ -117,10 +117,10 @@ jobs:
- <<: *test-alpine - <<: *test-alpine
env: env:
- TARGET_BOX=LINUX64_MUSL - TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
- &test-cmake - &test-cmake
stage: test os: linux
compiler: clang compiler: clang
addons: addons:
apt: apt:
@ -147,6 +147,58 @@ jobs:
env: env:
- CMAKE=1 - CMAKE=1
# macOS job template (anchor: test-macos).
# before_script defines the shared make flags and installs gcc through
# Homebrew (needed for gfortran); script then runs make with those flags
# plus the per-job BTYPE value, wrapped in travis_wait 45.
- &test-macos
os: osx
osx_image: xcode8.3
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
# Second macOS job: same template, only BTYPE differs (BINARY=32).
- <<: *test-macos
env:
- BTYPE="BINARY=32"
# Emulated ARM job template (anchor: emulated-arm).
# before_install runs the multiarch/qemu-user-static "register" image.
# script writes a Dockerfile that starts from openblas/alpine:${IMAGE_ARCH},
# copies the checkout to /tmp/openblas, configures it with CMake for
# ${TARGET_ARCH} using ${COMPILER} (shared library, LAPACK and CBLAS
# disabled, Release build), builds it, and then runs "docker build .".
# IMAGE_ARCH, TARGET_ARCH and COMPILER are taken from the job's env line.
- &emulated-arm
dist: trusty
sudo: required
services: docker
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
name: "Emulated Build for ARMV6 with gcc"
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
script: |
echo "FROM openblas/alpine:${IMAGE_ARCH}
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=${TARGET_ARCH} \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
# The remaining emulated variants reuse the template and override only
# env and name.
- <<: *emulated-arm
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
name: "Emulated Build for ARMV6 with clang"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
name: "Emulated Build for ARMV8 with gcc"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
name: "Emulated Build for ARMV8 with clang"
# Jobs matched by these env strings are the four emulated ARM builds
# (ARMV6/ARMV8 with gcc/clang); each entry must stay textually identical to
# the env line of the job it refers to.
# NOTE(review): presumably listed so that failures of the emulated builds do
# not fail the whole build - confirm against the Travis CI documentation.
allow_failures:
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
# whitelist # whitelist
branches: branches:
only: only:

View File

@ -6,21 +6,30 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM) project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 0.dev) set(OpenBLAS_PATCH_VERSION 6.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions # Adhere to GNU filesystem layout conventions
include(GNUInstallDirs) include(GNUInstallDirs)
set(OpenBLAS_LIBNAME openblas) include(CMakePackageConfigHelpers)
####### #######
if(MSVC) if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif() endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
####### #######
if(BUILD_WITHOUT_LAPACK) if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1) set(NO_LAPACK 1)
@ -34,11 +43,13 @@ endif()
####### #######
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others) set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
@ -146,6 +157,7 @@ endif()
# add objects to the openblas lib # add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
# Android needs to explicitly link against libm # Android needs to explicitly link against libm
if(ANDROID) if(ANDROID)
@ -165,6 +177,7 @@ endif()
# Set output for libopenblas # Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
@ -204,14 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION}
) )
# Optional post-build step: rename the symbols exported by the shared library.
#
# Runs only for shared-library builds in which SYMBOLPREFIX and/or
# SYMBOLSUFFIX is non-empty. exports/gensymbol is invoked in "objcopy" mode
# to write a rename list to ${PROJECT_BINARY_DIR}/objcopy.def, and objcopy
# then applies that list to the built library via --redefine-syms.
#
# The guard previously tested ${SYMBOLSUFIX} (misspelled, hence always
# empty), so a configuration that set only SYMBOLSUFFIX skipped this step
# entirely. The operands of the string comparisons are quoted so that an
# empty value stays a real operand instead of vanishing from the argument
# list.
if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")

  # Architecture name, falling back to x86_64 when ARCH is not defined and
  # forced to GENERIC for the generic core.
  # NOTE(review): ARCH_IN is computed here but the gensymbol call below
  # passes ${ARCH}, not ${ARCH_IN}. This looks unintentional; it is left
  # unchanged until the expected gensymbol argument is confirmed.
  if (NOT DEFINED ARCH)
    set(ARCH_IN "x86_64")
  else()
    set(ARCH_IN ${ARCH})
  endif()
  if ("${CORE}" STREQUAL "generic")
    set(ARCH_IN "GENERIC")
  endif ()

  # gensymbol takes positional arguments, so every build switch needs a
  # value: <name>_IN mirrors <name> when it is defined and is 0 otherwise.
  foreach (_openblas_switch
           EXPRECISION NO_CBLAS NO_LAPACK NO_LAPACKE NEED2UNDERSCORES ONLY_CBLAS)
    if (NOT DEFINED ${_openblas_switch})
      set(${_openblas_switch}_IN 0)
    else()
      set(${_openblas_switch}_IN ${${_openblas_switch}})
    endif()
  endforeach ()

  # Default value for BU when the including scripts did not define it.
  if (NOT DEFINED BU)
    set(BU _)
  endif()

  if (NOT "${SYMBOLPREFIX}" STREQUAL "")
    message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
  endif()
  if (NOT "${SYMBOLSUFFIX}" STREQUAL "")
    message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
  endif()

  # VERBATIM is deliberately not used: the first command relies on the
  # shell redirection (>) and on the escaped quotes around the prefix and
  # suffix, which keep those two positional arguments present when empty.
  add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
    COMMENT "renaming symbols"
  )
endif()
# Install project # Install project
# Install libraries # Install libraries
install(TARGETS ${OpenBLAS_LIBNAME} install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install headers
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
@ -259,11 +342,31 @@ if(NOT NO_LAPACKE)
ADD_CUSTOM_TARGET(genlapacke ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
) )
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif() endif()
include(FindPkgConfig QUIET) include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND) if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif() endif()
# Generate and install the CMake package files for OpenBLAS: a package
# config file, a package version file and the exported-targets file. All
# three are installed into CMAKECONFIG_INSTALL_DIR, and their installed
# names carry ${SUFFIX64}.
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
# Expand cmake/OpenBLASConfig.cmake.in into the build tree under the
# (possibly suffixed) package name.
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
# ${${PN}_VERSION} resolves to OpenBLAS_VERSION. The version file is written
# under an unsuffixed name here and gets its suffix at install time (RENAME
# below).
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
VERSION ${${PN}_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
# The export set name must match the EXPORT argument given to
# install(TARGETS ...) for the library target.
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@ -1,4 +1,247 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.3.5
31-Dec-2018
common:
* loop unrolling in TRMV has been enabled again.
* A domain error in the thread workload distribution for SYRK
has been fixed.
* gmake builds will now automatically add -fPIC to the build
options if the platform requires it.
* a pthreads key leakage (and associated crash on dlclose) in
the USE_TLS codepath was fixed.
* building of the utest cases on systems that do not provide
an implementation of complex.h was fixed.
x86_64:
* the SkylakeX code was changed to compile on OSX.
* unwanted application of the -march=skylake-avx512 option
to the common code parts of a DYNAMIC_ARCH build was fixed.
* improved performance of SGEMM for small workloads on Skylake X.
* performance of SGEMM and DGEMM was improved on Haswell.
ARMV8:
* a configuration error that broke the CNRM2 kernel was corrected.
* compilation of the GEMM kernels with CMAKE was fixed.
* DYNAMIC_ARCH builds are now available with CMAKE as well.
* using CMAKE for cross-compilation to the new cpu TARGETs
introduced in 0.3.4 now works.
POWER:
* a problem in cpu autodetection for AIX has been corrected.
====================================================================
Version 0.3.4
02-Dec-2018
common:
* the new, experimental thread-local memory allocation had
inadvertently been left enabled for gmake builds in 0.3.3
despite the announcement. It is now disabled by default, and
single-threaded builds will keep using the old allocator even
if the USE_TLS option is turned on.
* OpenBLAS will now provide enough buffer space for at least 50
threads by default.
* The output of openblas_get_config() now contains the version
number.
* A serious thread safety bug in GEMV operation with small M and
large N size has been fixed.
* The code will now automatically call blas_thread_init after a
fork if needed before handling a call to openblas_set_num_threads
* Accesses to parallelized level3 functions from multiple callers
are now serialized to avoid thread races (unless using OpenMP).
This should provide better performance than the known-threadsafe
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
* When building LAPACK with gfortran, -frecursive is now (again)
enabled by default to ensure correct behaviour.
* The OpenBLAS version of cblas.h now supports both CBLAS_ORDER and
CBLAS_LAYOUT as the name of the matrix row/column order option.
* Externally set LDFLAGS are now passed through to the final compile/link
steps to facilitate setting platform-specific linker flags.
* A potential race condition during the build of LAPACK (that would
usually manifest itself as a failure to build TESTING/MATGEN) has been
fixed.
* xHEMV has been changed to stay single-threaded for small input sizes
where the overhead of multithreading exceeds any possible gains
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
ThunderX hardware with sizable input.
* Linker flags for the PGI compiler have been updated
* Behaviour of AXPY with zero increments is now handled in the C interface,
correcting the result on at least Intel Atom.
* The result matrix from calling SGELSS with an all-zero input matrix is
now zeroed completely.
x86_64:
* Autodetection of AMD Ryzen2 has been fixed (again).
* CMAKE builds now support labeling of an INTERFACE64=1 build of
the library with the _64 suffix.
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
has been sped up by rewriting with C intrinsics
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
POWER:
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
* CPU type detection has been implemented for AIX.
* CPU type detection has been fixed for NETBSD.
MIPS64:
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
* DSDOT on LOONGSON3A has been fixed.
* the SGEMM microkernel has been hardened against potential data loss.
ARMV8:
* DYNAMIC_ARCH support is now available for 64bit ARM
* cross-compiling for ARMV8 under iOS now works.
* cpu-specific code has been rearranged to make better use of both
hardware commonalities and model-specific compiler optimizations.
* XGENE1 has been removed as a TARGET, superseded by the improved generic
ARMV8 support.
ARMV7:
* Older assembly mnemonics have been converted to UAL form to allow
building with clang 7.0
* Cross compiling LAPACKE for Android has been fixed again (broken by
update to LAPACK 3.7.0 some while ago).
====================================================================
Version 0.3.3
31-Aug-2018
common:
* thread memory allocation has been switched back to the method
used before version 0.3.1 due to unexpected problems caused by
the new code under some circumstances. A new compile-time option
USE_TLS has been added to enable the new code, and it is hoped
that this can become the default again in the next version.
* LAPACK PR272 has been integrated, which fixes spurious errors
in DSYEVR and related functions caused by missing conversion
from ILAENV to ILAENV_2STAGE in several _2stage routines.
* the cmake-generated OpenBLASConfig.cmake now uses correct case
for the name of the library
* added support for Haiku OS
x86_64:
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
DSCAL, DGEMVN and DSYMVL
* added a workaround for a cygwin issue that prevented compilation
of AVX512 code
IBM Z:
* added autodetection of Z14
* fixed TRMM errors in the generic target
====================================================================
Version 0.3.2
30-Jul-2018
common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1
POWER:
* fixed cpu autodetection for the BSDs
MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC
====================================================================
Version 0.3.1
01-Jul-2018
common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)
POWER:
* corrected CROT and ZROT behaviour with zero INC_X
ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y
x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows specifying
the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default
x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
====================================================================
Version 0.3.0
23-May-2018
common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)
SPARC:
* several fixes for cpu autodetection
POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions
ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets
x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX
IBM Z:
* added optimized BLAS 1/2 functions
MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu
==================================================================== ====================================================================
Version 0.2.20 Version 0.2.20
24-Jul-2017 24-Jul-2017

View File

@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack RELA = re_lapack
endif endif
# When NO_FORTRAN is 1, define NOFORTRAN as 1 and also define NO_LAPACK
# as 1, then export both variables.
# NOTE(review): presumably the export is what makes them visible to the
# sub-makes started with $(MAKE) -C - confirm.
ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@ -47,7 +58,7 @@ endif
endif endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))" @echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif endif
ifneq ($(OSNAME), AIX) ifneq ($(OSNAME), AIX)
@ -86,16 +97,12 @@ endif
shared : shared :
ifndef NO_SHARED ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
@ -112,7 +119,7 @@ endif
endif endif
tests : tests :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME) touch $(LIBNAME)
ifndef NO_FBLAS ifndef NO_FBLAS
$(MAKE) -C test all $(MAKE) -C test all
@ -124,7 +131,7 @@ endif
endif endif
libs : libs :
ifeq ($(CORE), UNKOWN) ifeq ($(CORE), UNKNOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif endif
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
@ -157,6 +164,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done done
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
ifeq ($(DYNAMIC_OLDER), 1)
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
endif
endif endif
ifdef USE_THREAD ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
@ -211,7 +221,7 @@ netlib :
else else
netlib : lapack_prebuild netlib : lapack_prebuild
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif endif
@ -232,7 +242,7 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild : lapack_prebuild :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -241,7 +251,7 @@ ifndef NOFORTRAN
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -257,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP ifdef SMP
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif endif
@ -275,21 +287,21 @@ endif
endif endif
large.tgz : large.tgz :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz; -wget http://www.netlib.org/lapack/timing/large.tgz;
fi fi
endif endif
timing.tgz : timing.tgz :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz; -wget http://www.netlib.org/lapack/timing/timing.tgz;
fi fi
endif endif
lapack-timing : large.tgz timing.tgz lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
@ -298,9 +310,10 @@ endif
lapack-test : lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1) ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion ) ./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif endif
@ -312,9 +325,9 @@ lapack-runtest:
blas-test: blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
dummy : dummy :

View File

@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a
endif endif
ifeq ($(CORE), CORTEXA57) ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif endif
ifeq ($(CORE), VULCAN) ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
endif
ifeq ($(CORE), CORTEXA72)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
ifeq ($(CORE), CORTEXA73)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif endif
ifeq ($(CORE), THUNDERX) ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx FCOMMON_OPT += -march=armv8-a -mtune=thunderx
endif
ifeq ($(CORE), FALKOR)
CCOMMON_OPT += -march=armv8-a -mtune=falkor
FCOMMON_OPT += -march=armv8-a -mtune=falkor
endif endif
ifeq ($(CORE), THUNDERX2T99) ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif endif

View File

@ -48,6 +48,7 @@ ifndef NO_CBLAS
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@ -66,18 +67,14 @@ endif
#for install shared library #for install shared library
ifndef NO_SHARED ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ -98,11 +95,39 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif endif
endif endif
else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
#Generating openblas.pc #Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@ -115,7 +140,7 @@ endif
ifndef NO_SHARED ifndef NO_SHARED
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))

View File

@ -17,6 +17,10 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99 EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif endif
ifeq ($(TARGET), 1004K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), P5600) ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5 TARGET_FLAGS = -mips32r5
endif endif

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.0.dev VERSION = 0.3.6.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
# If you want to support multiple architecture in one binary # If you want to support multiple architecture in one binary
# DYNAMIC_ARCH = 1 # DYNAMIC_ARCH = 1
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
# mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
# DYNAMIC_OLDER = 1
# C compiler including binary type(32bit / 64bit). Default is gcc. # C compiler including binary type(32bit / 64bit). Default is gcc.
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
# CC = gcc # CC = gcc
@ -55,11 +60,26 @@ VERSION = 0.3.0.dev
# This flag is always set for POWER8. Don't modify the flag # This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1 # USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be # You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's # less than actual number of cores. If you don't specify one, it's
# automatically detected by the script. # automatically detected by the script.
# NUM_THREADS = 24 # NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in. # if you don't need to install the static library, please comment it in.
# NO_STATIC = 1 # NO_STATIC = 1
@ -89,6 +109,12 @@ BUILD_LAPACK_DEPRECATED = 1
# If you want to use legacy threaded Level 3 implementation. # If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
# USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran # If you want to drive whole 64bit region by BLAS. Not all Fortran
# compilers support this. It's safe to keep it commented out if you # compilers support this. It's safe to keep it commented out if you
# are not sure(equivalent to "-i8" option). # are not sure(equivalent to "-i8" option).
@ -100,7 +126,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1 NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux. # If you want to disable CPU/Memory affinity on Linux.
#NO_AFFINITY = 1 NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1 # BIGNUMA = 1
@ -126,6 +152,9 @@ NO_WARMUP = 1
# FUNCTION_PROFILE = 1 # FUNCTION_PROFILE = 1
# Support for IEEE quad precision(it's *real* REAL*16)( under testing) # Support for IEEE quad precision(it's *real* REAL*16)( under testing)
# This option should not be used - it is a holdover from unfinished code present
# in the original GotoBLAS2 library that may be usable as a starting point but
# is not even expected to compile in its present form.
# QUAD_PRECISION = 1 # QUAD_PRECISION = 1
# Threads are still working for a while after finishing BLAS operation # Threads are still working for a while after finishing BLAS operation
@ -144,8 +173,11 @@ NO_WARMUP = 1
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with single thread. You can use this flag to avoid the overhead of multi-threading # with single thread. (Actually in recent versions this is a factor proportional to the
# in small matrix sizes. The default value is 4. # number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 4
# If you need sanity check by comparing reference BLAS. It'll be very # If you need sanity check by comparing reference BLAS. It'll be very
@ -160,8 +192,8 @@ NO_WARMUP = 1
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2 # COMMON_OPT = -O2
# gfortran option for LAPACK # gfortran option for LAPACK to improve thread-safety
# enable this flag only on 64bit Linux and if you need a thread safe lapack library # It is enabled by default in Makefile.system for gfortran
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive # FCOMMON_OPT = -frecursive

View File

@ -9,6 +9,17 @@ ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# Default C compiler # Default C compiler
@ -17,15 +28,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables # http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW). # - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default) ifeq ($(origin CC),default)
# Check if $(CC) refers to a valid command and set the value to gcc if not
ifneq ($(findstring cmd.exe,$(SHELL)),)
ifeq ($(shell where $(CC) 2>NUL),)
CC = gcc CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif endif
else # POSIX-ish
ifeq ($(shell command -v $(CC) 2>/dev/null),)
ifeq ($(shell uname -s),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
else
CC = gcc
endif # Darwin
endif # CC exists
endif # Shell is sane
endif # CC is set to default
# Default Fortran compiler (FC) is selected by f_check. # Default Fortran compiler (FC) is selected by f_check.
@ -45,6 +65,7 @@ endif
ifdef TARGET ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET) GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
endif endif
# Force fallbacks for 32bit # Force fallbacks for 32bit
@ -53,6 +74,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL) ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE) ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -86,6 +110,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL) ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE) ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -132,6 +159,10 @@ ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2 GETARCH_FLAGS += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
GETARCH_FLAGS += -DNO_AVX512
endif
ifeq ($(DEBUG), 1) ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g GETARCH_FLAGS += -g
endif endif
@ -175,6 +206,10 @@ endif
endif endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
endif
ifndef NUM_THREADS ifndef NUM_THREADS
NUM_THREADS = $(NUM_CORES) NUM_THREADS = $(NUM_CORES)
endif endif
@ -225,12 +260,12 @@ endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6 export MACOSX_DEPLOYMENT_TARGET=10.8
endif endif
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
ifeq ($(OSNAME), FreeBSD) ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
@ -304,6 +339,7 @@ endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
NEED_PIC = 0 NEED_PIC = 0
NO_EXPRECISION = 1 NO_EXPRECISION = 1
OS_CYGWIN_NT = 1
endif endif
ifneq ($(OSNAME), WINNT) ifneq ($(OSNAME), WINNT)
@ -423,7 +459,7 @@ CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), INTEL) ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -openmp CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
@ -448,13 +484,44 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += PENRYN DUNNINGTON
endif
DYNAMIC_CORE += NEHALEM
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += OPTERON OPTERON_SSE3
endif
DYNAMIC_CORE += BARCELONA
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += BOBCAT ATOM NANO
endif
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif endif
ifneq ($(NO_AVX2), 1) ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL ZEN DYNAMIC_CORE += HASWELL ZEN
endif endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
endif
endif
endif
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
endif endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@ -554,9 +621,14 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64
endif endif
ifeq ($(CORE), 1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
ifeq ($(CORE), P5600) ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif endif
ifeq ($(CORE), I6400) ifeq ($(CORE), I6400)
@ -660,6 +732,8 @@ endif
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1) ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
@ -703,7 +777,7 @@ FCOMMON_OPT += -i8
endif endif
endif endif
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp FCOMMON_OPT += -fopenmp
endif endif
endif endif
@ -883,6 +957,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH CCOMMON_OPT += -DDYNAMIC_ARCH
endif endif
ifeq ($(DYNAMIC_OLDER), 1)
CCOMMON_OPT += -DDYNAMIC_OLDER
endif
ifeq ($(NO_LAPACK), 1) ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface #Disable LAPACK C interface
@ -905,6 +983,10 @@ ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2 CCOMMON_OPT += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifdef SMP ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER
@ -951,10 +1033,18 @@ endif
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
ifdef USE_SIMPLE_THREADED_LEVEL3 ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif endif
ifdef USE_TLS
CCOMMON_OPT += -DUSE_TLS
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
ifndef SYMBOLPREFIX ifndef SYMBOLPREFIX
SYMBOLPREFIX = SYMBOLPREFIX =
endif endif
@ -1065,8 +1155,6 @@ ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive FCOMMON_OPT = -O2 -frecursive
endif endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@ -1074,6 +1162,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES = #MAKEOVERRIDES =
ifdef NEED_PIC
ifeq (,$(findstring PIC,$(FFLAGS)))
override FFLAGS += -fPIC
endif
endif
#For LAPACK Fortran codes. #For LAPACK Fortran codes.
#Disable -fopenmp for LAPACK Fortran codes on Windows. #Disable -fopenmp for LAPACK Fortran codes on Windows.
ifdef OS_WINDOWS ifdef OS_WINDOWS
@ -1132,7 +1226,11 @@ endif
LIBDLLNAME = $(LIBPREFIX).dll LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@ -1209,6 +1307,7 @@ export MSA_FLAGS
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE
export NO_AVX512
export SGEMM_UNROLL_M export SGEMM_UNROLL_M
export SGEMM_UNROLL_N export SGEMM_UNROLL_N

View File

@ -8,6 +8,34 @@ endif
endif endif
endif endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
CCOMMON_OPT += -mavx2
FCOMMON_OPT += -mavx2
endif
endif
endif
ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64 ARFLAGS = -m x64
endif endif

230
README.md
View File

@ -5,175 +5,221 @@
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages ## Binary Packages
We provide binary packages for the following platform.
We provide official binary packages for the following platform:
* Windows x86/x86_64 * Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
## Installation from Source ## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
Building OpenBLAS requires the following to be installed:
* GNU Make
* A C compiler, e.g. GCC or Clang
* A Fortran compiler (optional, for LAPACK)
* IBM MASS (optional, see below)
### Normal compile ### Normal compile
* type "make" to detect the CPU automatically.
or Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
### Cross compile ### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
The target must be specified explicitly when cross compiling.
Examples: Examples:
On X86 box, compile this library for loongson3a CPU. * On an x86 box, compile this library for a loongson3a CPU:
```sh
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
```
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
```sh
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
### Debug version ### Debug version
make DEBUG=1 A debug version can be built using `make DEBUG=1`.
### Compile with MASS Support on Power CPU (Optional dependency) ### Compile with MASS support on Power CPU (optional)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. consists of a set of mathematical functions for C, C++, and Fortran applications that are
The library can be installed as below - tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
* On Ubuntu: * On Ubuntu:
```sh
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.5
```
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br> * On RHEL/CentOS:
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br> ```sh
sudo apt-get update</br> wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo apt-get install libxlmass-devel.8.1.5</br> sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.5
```
* On RHEL/CentOS: After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br> ### Install to a specific directory (optional)
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
After installing MASS library, compile openblas with USE_MASS=1. Use `PREFIX=` when invoking `make`, for example
Example: ```sh
make install PREFIX=your_installation_directory
```
Compiling on Power8 with MASS support - The default installation directory is `/opt/OpenBLAS`.
make USE_MASS=1 TARGET=POWER8 ## Supported CPUs and Operating Systems
### Install to the directory (optional) Please read `GotoBLAS_01Readme.txt`.
Example: ### Additional supported CPUs
make install PREFIX=your_installation_directory #### x86/x86-64
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64: #### MIPS64
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental - **ICT Loongson 3B**: Experimental
#### ARM: #### ARM
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM64: - **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
- **ARMV8**: Experimental - **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental - **ARM Cortex-A57**: Experimental
#### PPC/PPC64 #### PPC/PPC64
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
#### IBM zEnterprise System: - **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
### Supported OS
### Support OS:
- **GNU/Linux** - **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **FreeBSD**: Supported by community. We didn't test the library on this OS. - **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. - **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages ## Usage
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables. Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
compiled as a shared library.
Examples: ### Setting the number of threads using environment variables
export OPENBLAS_NUM_THREADS=4 Environment variables are used to specify a maximum number of threads.
For example,
or ```sh
export OPENBLAS_NUM_THREADS=4
export GOTO_NUM_THREADS=4
export OMP_NUM_THREADS=4
```
export GOTO_NUM_THREADS=4 The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
or If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
compiled with `USE_OPENMP=1`.
export OMP_NUM_THREADS=4 ### Setting the number of threads at runtime
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. We provide the following functions to control the number of threads at runtime:
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. ```c
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
### Set the number of threads on runtime. If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
We provided the below functions to control the number of threads on runtime. ## Reporting bugs
void goto_set_num_threads(int num_threads); Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
## Contact ## Contact
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
## ChangeLog ## Change log
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
## Troubleshooting ## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). Clang 3.0 will generate the wrong AVX binary code.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
Makefile.rule. However, note that this may cause
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
However, it will be okay when you run the same test case on the shell.
## Contributing ## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
1. Write a test which shows that the bug was fixed or that the feature works as expected. to start a discussion around a feature idea or a bug.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. 2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
3. Write a test which shows that the bug was fixed or that the feature works as expected.
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
## Donation ## Donation
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).

View File

@ -20,6 +20,7 @@ DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE SANDYBRIDGE
HASWELL HASWELL
SKYLAKEX
ATOM ATOM
b)AMD CPU: b)AMD CPU:
@ -56,6 +57,7 @@ CELL
3.MIPS CPU: 3.MIPS CPU:
P5600 P5600
1004K
4.MIPS64 CPU: 4.MIPS64 CPU:
SICORTEX SICORTEX
@ -81,8 +83,11 @@ ARMV5
8.ARM 64-bit CPU: 8.ARM 64-bit CPU:
ARMV8 ARMV8
CORTEXA53
CORTEXA57 CORTEXA57
VULCAN CORTEXA72
CORTEXA73
FALKOR
THUNDERX THUNDERX
THUNDERX2T99 THUNDERX2T99

View File

@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`. `MAX_CPU_NUMBER=NUM_THREADS`.
Despite its name, and due to the use of memory buffers in functions like SGEMM,
the setting of NUM_THREADS can be relevant even for a single-threaded build
of OpenBLAS, if such functions get called by multiple threads of a program
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
a segmentation fault without displaying the above warning first.
Note that the number of threads used at runtime can be altered to differ from the
value NUM_THREADS was set to at build time. At runtime, the actual number of
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
that this does not change the number of memory buffers that will be allocated,
which is set at build time). The number of threads for a process can be set by
using the mechanisms described below.
#### How can I use OpenBLAS in multi-threaded applications? #### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS If your application is already multi-threaded, it will conflict with OpenBLAS

View File

@ -237,7 +237,7 @@ int main(int argc, char *argv[]){
timeg = time1/loops; timeg = time1/loops;
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops %10.6f sec\n", " %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1); COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1);
} }

View File

@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0}; FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0}; FLOAT beta [] = {1.0, 0.0};
char trans='N'; char trans='N';
blasint m, i, j; blasint m, i, j;
blasint inc_x=1,inc_y=1; blasint inc_x=1,inc_y=1;

22
c_check
View File

@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/); $os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/); $os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/); $os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/); $os = AIX if ($data =~ /OS_AIX/);
@ -62,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/); $os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/); $os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@ -199,6 +202,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32; $binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/); $binformat = bin64 if ($data =~ /BINARY_64/);
# Probe whether the C compiler can build AVX512 code.
# Result: $no_avx512 is 1 when the test compile fails, 0 otherwise. The probe
# is only attempted for x86 / x86_64 targets; elsewhere the flag stays 0.
# NOTE(review): $tmpf and $compiler_name are set up outside this block; $tmpf
# is used both as the print handle and, interpolated, as the source file path.
$no_avx512 = 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
    # A single AVX512 instruction wrapped in inline asm is enough for the probe.
    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
    print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
    $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
    my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
    system(@cmd);
    # $? holds the wait status of the compiler invocation above.
    $no_avx512 = ($? != 0) ? 1 : 0;
    # Remove the object file the compiler was asked to write ("-o $tmpf.o").
    unlink("$tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/; $data =~ /globl\s([_\.]*)(.*)/;
@ -206,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1; $need_fu = $1;
$cross = 0; $cross = 0;
$cross = 1 if ($os ne $hostos);
if ($architecture ne $hostarch) { if ($architecture ne $hostarch) {
$cross = 1; $cross = 1;
@ -214,6 +231,8 @@ if ($architecture ne $hostarch) {
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
} }
$cross = 1 if ($os ne $hostos);
$openmp = "" if $ENV{USE_OPENMP} != 1; $openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = ""; $linker_L = "";
@ -286,6 +305,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
$os =~ tr/[a-z]/[A-Z]/; $os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/;

View File

@ -51,6 +51,7 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
typedef CBLAS_ORDER CBLAS_LAYOUT;
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
@ -82,6 +83,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@ -0,0 +1,79 @@
# OpenBLASConfig.cmake
# --------------------
#
# OpenBLAS cmake module.
# This module sets the following variables in your project::
#
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
# OpenBLAS_INCLUDE_DIR - same as DIRS
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
# OpenBLAS_LIBRARY - same as LIBRARIES
#
#
# Available components::
#
## shared - search for only shared library
## static - search for only static library
# serial - search for unthreaded library
# pthread - search for native pthread threaded library
# openmp - search for OpenMP threaded library
#
#
# Exported targets::
#
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
## target. Target is shared _or_ static, so, for both, use separate, not
## overlapping, installations. ::
#
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
#
#
# Suggested usage::
#
# find_package(OpenBLAS)
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
#
#
# The following variables can be set to guide the search for this package::
#
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
# PATH - environment variable, set to bin directory of this package
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
@PACKAGE_INIT@
# Package name used as the prefix for every variable and target set below.
set(PN OpenBLAS)
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
# Exactly one threading component is marked as found, in priority order:
# openmp, then pthread, otherwise serial. The @USE_OPENMP@ / @USE_THREAD@
# placeholders are substituted when this template is configured.
if(@USE_OPENMP@)
set(${PN}_openmp_FOUND 1)
elseif(@USE_THREAD@)
set(${PN}_pthread_FOUND 1)
else()
set(${PN}_serial_FOUND 1)
endif()
# Compare the components requested by find_package() with the *_FOUND flags set above.
# NOTE(review): check_required_components is presumably provided by the
# @PACKAGE_INIT@ expansion earlier in this file -- confirm.
check_required_components(${PN})
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
# Import the exported targets (unless a parent project already defines
# OpenBLAS::OpenBLAS) and publish the legacy result variables described in
# the header of this file.
if(NOT TARGET ${PN}::OpenBLAS)
  include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")

  # Location of the imported library file.
  get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
  set(${PN}_LIBRARY ${_loc})

  # OpenBLAS_LIBRARIES is documented as "library to link against": provide the
  # library itself followed by whatever it requires consumers to link.
  get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
  set(${PN}_LIBRARIES ${_loc} ${_ill})

  # OpenBLAS_INCLUDE_DIR is documented as "same as DIRS", so both variables are
  # taken from the usage requirement. The previous code read the build-only
  # INCLUDE_DIRECTORIES property for the singular form.
  # NOTE(review): exported target files normally carry only INTERFACE_*
  # properties, which would have left OpenBLAS_INCLUDE_DIR empty -- confirm
  # against the generated OpenBLASTargets.cmake.
  get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
  set(${PN}_INCLUDE_DIR ${_iid})
  set(${PN}_INCLUDE_DIRS ${_iid})
endif()

View File

@ -44,18 +44,36 @@ endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
endif ()
if (X86) if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif () endif ()
if (X86_64) if (X86_64)
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE PRESCOTT CORE2)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
endif ()
if (NOT NO_AVX) if (NOT NO_AVX)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
endif () endif ()
if (NOT NO_AVX2) if (NOT NO_AVX2)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif () endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
endif () endif ()
if (NOT DYNAMIC_CORE) if (NOT DYNAMIC_CORE)

View File

@ -3,6 +3,11 @@
## Description: Ported from portion of OpenBLAS/Makefile.system ## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables. ## Sets Fortran related variables.
# When INTERFACE64 is enabled, define the "64" / "_64" name suffixes; both
# variables are left unset otherwise.
# NOTE(review): SUFFIX64_UNDERSCORE appears to feed the pkg-config template's
# libsuffix (-lopenblas${libsuffix}) -- confirm where SUFFIX64 is consumed.
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (${F_COMPILER} STREQUAL "FLANG") if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64) if (BINARY64 AND INTERFACE64)
@ -39,7 +44,7 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN") if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran") set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@ -1,9 +1,11 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@ Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas Libs: -L${libdir} -lopenblas${libsuffix}
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -85,15 +85,20 @@ if (NOT NOFORTRAN)
endif () endif ()
# Cannot run getarch on target if we are cross-compiling # Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING) if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would # Write to config as getarch would
# Core name written to the generated config: start from CORE and let an
# explicitly defined TARGET_CORE take precedence.
set(TCORE ${CORE})
if (DEFINED TARGET_CORE)
  set(TCORE ${TARGET_CORE})
endif()
# TODO: Set up defines that getarch sets up based on every other target # TODO: Set up defines that getarch sets up based on every other target
# Perhaps this should be inside a different file as it grows larger # Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define ${CORE}\n" "#define ${TCORE}\n"
"#define CHAR_CORENAME \"${CORE}\"\n") "#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${CORE}" STREQUAL "ARMV7") if ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n" "#define L1_DATA_LINESIZE\t32\n"
@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4) set(DGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "ARMV8") elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_LINESIZE\t64\n"
@ -116,18 +121,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define L2_LINESIZE\t64\n" "#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n" "#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n") "#define L2_ASSOCIATIVE\t32\n"
set(SGEMM_UNROLL_M 4) "#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "CORTEXA57") set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n" "#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n" "#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n" "#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t2097152\n" "#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n" "#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n" "#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n"
@ -135,15 +148,124 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define HAVE_VFPV4\n" "#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n" "#define HAVE_VFPV3\n"
"#define HAVE_VFP\n" "#define HAVE_VFP\n"
"#define HAVE_NEON\n") "#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4) set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4) set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX")
  # Cache / TLB parameters and feature macros appended to the generated config
  # for THUNDERX, followed by the GEMM unroll factors for this core.
  # L2_SIZE is 16777216 (16 MiB). The previous value 167772164 was not a
  # multiple of the 128-byte L2 line size and looks like a stray digit.
  # NOTE(review): confirm 16 MiB against getarch's THUNDERX definition.
  file(APPEND ${TARGET_CONF_TEMP}
    "#define L1_CODE_SIZE\t32768\n"
    "#define L1_CODE_LINESIZE\t64\n"
    "#define L1_CODE_ASSOCIATIVE\t3\n"
    "#define L1_DATA_SIZE\t32768\n"
    "#define L1_DATA_LINESIZE\t128\n"
    "#define L1_DATA_ASSOCIATIVE\t2\n"
    "#define L2_SIZE\t16777216\n"
    "#define L2_LINESIZE\t128\n"
    "#define L2_ASSOCIATIVE\t16\n"
    "#define DTB_DEFAULT_ENTRIES\t64\n"
    "#define DTB_SIZE\t4096\n"
    "#define HAVE_VFPV4\n"
    "#define HAVE_VFPV3\n"
    "#define HAVE_VFP\n"
    "#define HAVE_NEON\n"
    "#define ARMV8\n")
  set(SGEMM_UNROLL_M 4)
  set(SGEMM_UNROLL_N 4)
  set(DGEMM_UNROLL_M 2)
  set(DGEMM_UNROLL_N 2)
  set(CGEMM_UNROLL_M 2)
  set(CGEMM_UNROLL_N 2)
  set(ZGEMM_UNROLL_M 2)
  set(ZGEMM_UNROLL_N 2)
  set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t33554432\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
endif() endif()
# Or should this actually be NUM_CORES? # Or should this actually be NUM_CORES?
@ -163,6 +285,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
# Move to where gen_config_h would place it # Move to where gen_config_h would place it
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
else(NOT CMAKE_CROSSCOMPILING) else(NOT CMAKE_CROSSCOMPILING)

View File

@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1) set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
set(TARGET "NEHALEM") set(TARGET "NEHALEM")
endif () endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@ -41,6 +41,22 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif () endif ()
endif () endif ()
# Extra compiler flags for the kernels of specific x86 targets:
#   SKYLAKEX -> -march=skylake-avx512 (unless NO_AVX512 is set)
#   HASWELL  -> -mavx2 (unless NO_AVX2 is set), for GCC >= 4.7 or any Clang
if (DEFINED TARGET)
  # Expansions are quoted so an empty TARGET cannot produce a malformed if().
  if ("${TARGET}" STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
  endif()
  if ("${TARGET}" STREQUAL "HASWELL" AND NOT NO_AVX2)
    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
      # Strip the trailing newline so the version comparison sees a clean string.
      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                      OUTPUT_VARIABLE GCC_VERSION
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
      if (NOT "${GCC_VERSION}" VERSION_LESS 4.7)
        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
      endif()
    # CMake reports "Clang" / "AppleClang"; the former comparison against
    # "CLANG" could never match because STREQUAL is case-sensitive.
    elseif ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
    endif()
  endif()
endif()
if (DEFINED TARGET) if (DEFINED TARGET)
message(STATUS "Targeting the ${TARGET} architecture.") message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}") set(GETARCH_FLAGS "-DFORCE_${TARGET}")
@ -96,8 +112,12 @@ if (NOT CMAKE_CROSSCOMPILING)
endif() endif()
# Default NUM_PARALLEL to 1 when the user did not supply it. The value is later
# passed to the compiler as -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}.
if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()
if (NOT DEFINED NUM_THREADS) if (NOT DEFINED NUM_THREADS)
if (NOT NUM_CORES EQUAL 0) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT? # HT?
set(NUM_THREADS ${NUM_CORES}) set(NUM_THREADS ${NUM_CORES})
else () else ()
@ -159,6 +179,9 @@ endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
endif () endif ()
if (NO_LAPACK) if (NO_LAPACK)
@ -207,6 +230,10 @@ if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif () endif ()
if (USE_TLS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
endif ()
# Only for development # Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
@ -224,6 +251,8 @@ endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (USE_SIMPLE_THREADED_LEVEL3) if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif () endif ()
@ -291,6 +320,8 @@ if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif () endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
set(REVISION "-r${OpenBLAS_VERSION}") set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

View File

@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT) set(HOST_OS WINNT)
endif () endif ()
if ("${HOST_OS}" STREQUAL "LINUX")
  # check if we're building natively on Android (TERMUX)
  # execute_process() does not go through a shell, so the trailing newline is
  # removed with OUTPUT_STRIP_TRAILING_WHITESPACE instead of piping through tr.
  execute_process(COMMAND uname -o
                  OUTPUT_VARIABLE OPERATING_SYSTEM
                  OUTPUT_STRIP_TRAILING_WHITESPACE
                  ERROR_QUIET)
  # Quoted so that an empty result (uname missing or failing) is a plain
  # non-match rather than a malformed if() expression.
  if ("${OPERATING_SYSTEM}" MATCHES "Android")
    set(HOST_OS ANDROID)
  endif()
endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32) if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
@ -66,3 +76,12 @@ else()
set(BINARY32 1) set(BINARY32 1)
endif() endif()
if (X86_64 OR X86)
  # Probe whether the C compiler can build AVX512 code: write a tiny program
  # that uses a zmm register and try to compile it with -march=skylake-avx512.
  # If the probe does not succeed, -DNO_AVX512 is added to the common C options.
  file(WRITE "${PROJECT_BINARY_DIR}/avx512.tmp" "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
  execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o "${PROJECT_BINARY_DIR}/avx512.o" -x c "${PROJECT_BINARY_DIR}/avx512.tmp" OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
  # Any result other than 0 (a non-zero exit status, or the error string set
  # when the compiler could not be started) means the probe failed.
  if (NOT NO_AVX512 EQUAL 0)
    set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
  endif()
  # Remove the probe files where they were actually written; relative paths
  # given to file(REMOVE) are resolved against the current source directory.
  file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o")
endif()

View File

@ -93,7 +93,7 @@ extern "C" {
#include <sched.h> #include <sched.h>
#endif #endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
#include <sched.h> #include <sched.h>
#endif #endif
@ -105,6 +105,10 @@ extern "C" {
#endif #endif
#endif #endif
/* Haiku builds always define NO_SYSV_IPC.
 * NOTE(review): presumably because SysV IPC is unavailable there - confirm. */
#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
#ifdef ATOM #ifdef ATOM
#define GOTO_ATOM ATOM #define GOTO_ATOM ATOM
@ -179,7 +183,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL #define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) #define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#ifdef NEEDBUNDERSCORE #ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_ #define BLASFUNC(FUNC) FUNC##_
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
#ifdef USE64BITINT #ifdef USE64BITINT
typedef BLASLONG blasint; typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else #else
typedef int blasint; typedef int blasint;
#define blasabs(x) abs(x)
#endif #endif
#else #else
#ifdef USE64BITINT #ifdef USE64BITINT
@ -642,6 +652,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void); void gotoblas_profile_quit(void);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#ifndef C_MSVC #ifndef C_MSVC
int omp_in_parallel(void); int omp_in_parallel(void);
int omp_get_num_procs(void); int omp_get_num_procs(void);
@ -649,6 +660,21 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif #endif
/* Make the _Atomic qualifier usable regardless of compiler vintage:
 *  - C11 or newer: include <stdatomic.h>; for GCC older than 7, _Atomic is
 *    first defined as plain `volatile` (unless already defined).
 *  - anything older than C11: _Atomic is defined as `volatile` (unless
 *    already defined) and <stdatomic.h> is not included. */
#if (__STDC_VERSION__ >= 201112L)
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#include <stdatomic.h>
#else
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#else #else
#ifdef __ELF__ #ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak)); int omp_in_parallel (void) __attribute__ ((weak));

View File

@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
extern "C" { extern "C" {
#endif #endif
/* Direct single-precision GEMM kernel entry point: takes the problem
 * dimensions M, N, K and three float buffers (A, B, R), each with its own
 * stride.
 * NOTE(review): presumably R receives the product of A and B and the strides
 * are leading dimensions in elements - confirm against the kernel source. */
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
float * A, BLASLONG strideA,
float * B, BLASLONG strideB,
float * R, BLASLONG strideR);
/* Companion query taking only the problem dimensions.
 * NOTE(review): presumably returns non-zero when the direct kernel is the
 * faster choice for these sizes - confirm. */
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,

View File

@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
#define RPCC_DEFINED #define RPCC_DEFINED
#ifndef NO_AFFINITY #ifndef NO_AFFINITY
#define WHEREAMI //#define WHEREAMI
static inline int WhereAmI(void){ static inline int WhereAmI(void){
int ret=0; int ret=0;
__asm__ __volatile__(".set push \n" __asm__ __volatile__(".set push \n"

View File

@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* - large enough to support all architectures and kernel * - large enough to support all architectures and kernel
* Choosing too small a SIZE will lead to stack smashing. * Choosing too small a SIZE will lead to stack smashing.
*/ */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \ /* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \ /* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \ volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
stack_alloc_size = 0; \ STACK_ALLOC_PROTECT_SET \
STACK_ALLOC_PROTECT_SET \ /* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else #else
//Original OpenBLAS/GotoBLAS codes. //Original OpenBLAS/GotoBLAS codes.

View File

@ -178,6 +178,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
result = x/y; result = x/y;
return result; return result;
#else #else
#if (MAX_CPU_NUMBER > 64)
if ( y > 64) {
result = x/y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
@ -327,7 +333,7 @@ REALNAME:
#endif #endif
#endif #endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \

View File

@ -60,8 +60,13 @@
#endif #endif
*/ */
#define MB #ifdef __GNUC__
#define WMB #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){ static void __inline blas_lock(volatile BLASULONG *address){
@ -129,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
"=b" (*ebx), "=b" (*ebx),
"=c" (*ecx), "=c" (*ecx),
"=d" (*edx) "=d" (*edx)
: "0" (op)); : "0" (op), "c"(0));
#endif #endif
} }
@ -196,6 +201,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x; if (y <= 1) return x;
#if (MAX_CPU_NUMBER > 64)
if (y > 64) {
result = x / y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@ -403,7 +415,7 @@ REALNAME:
#define EPILOGUE .end #define EPILOGUE .end
#endif #endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 512; \ .align 512; \

View File

@ -53,6 +53,7 @@
#define VENDOR_SIS 8 #define VENDOR_SIS 8
#define VENDOR_TRANSMETA 9 #define VENDOR_TRANSMETA 9
#define VENDOR_NSC 10 #define VENDOR_NSC 10
#define VENDOR_HYGON 11
#define VENDOR_UNKNOWN 99 #define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@ -115,6 +116,8 @@
#define CORE_STEAMROLLER 25 #define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26 #define CORE_EXCAVATOR 26
#define CORE_ZEN 27 #define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define CORE_DHYANA 29
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -137,6 +140,8 @@
#define HAVE_AVX (1 << 18) #define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19) #define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20) #define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -211,5 +216,9 @@ typedef struct {
#define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51 #define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53
#define CPUTYPE_HYGON_UNKNOWN 54
#endif #endif

View File

@ -34,7 +34,7 @@
#define CPU_CORTEXA15 4 #define CPU_CORTEXA15 4
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"ARMV6", "ARMV6",
"ARMV7", "ARMV7",
"CORTEXA9", "CORTEXA9",

View File

@ -29,25 +29,37 @@
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_ARMV8 1 #define CPU_ARMV8 1
#define CPU_CORTEXA57 2 // Arm
#define CPU_VULCAN 3 #define CPU_CORTEXA53 2
#define CPU_THUNDERX 4 #define CPU_CORTEXA57 3
#define CPU_THUNDERX2T99 5 #define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
// Qualcomm
#define CPU_FALKOR 6
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
static char *cpuname[] = { static char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
"ARMV8" , "ARMV8" ,
"CORTEXA53",
"CORTEXA57", "CORTEXA57",
"VULCAN", "CORTEXA72",
"CORTEXA73",
"FALKOR",
"THUNDERX", "THUNDERX",
"THUNDERX2T99" "THUNDERX2T99"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
"unknown", "unknown",
"armv8" , "armv8",
"cortexa53",
"cortexa57", "cortexa57",
"vulcan", "cortexa72",
"cortexa73",
"falkor",
"thunderx", "thunderx",
"thunderx2t99" "thunderx2t99"
}; };
@ -114,13 +126,24 @@ int detect(void)
fclose(infile); fclose(infile);
if(cpu_part != NULL && cpu_implementer != NULL) { if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) // Arm
return CPU_CORTEXA57; if (strstr(cpu_implementer, "0x41")) {
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) if (strstr(cpu_part, "0xd03"))
return CPU_VULCAN; return CPU_CORTEXA53;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) else if (strstr(cpu_part, "0xd07"))
return CPU_CORTEXA57;
else if (strstr(cpu_part, "0xd08"))
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
return CPU_FALKOR;
// Cavium
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
return CPU_THUNDERX; return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99; return CPU_THUNDERX2T99;
} }
@ -179,64 +202,63 @@ void get_subdirname(void)
void get_cpuconfig(void) void get_cpuconfig(void)
{ {
// All arches should define ARMv8
printf("#define ARMV8\n");
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
int d = detect(); int d = detect();
switch (d) switch (d)
{ {
case CPU_CORTEXA53:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8: case CPU_ARMV8:
printf("#define ARMV8\n"); // Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n"); printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_CORTEXA57: case CPU_CORTEXA57:
printf("#define CORTEXA57\n"); case CPU_CORTEXA72:
printf("#define HAVE_VFP\n"); case CPU_CORTEXA73:
printf("#define HAVE_VFPV3\n"); // Common minimum settings for these Arm cores
printf("#define HAVE_NEON\n"); // Can change a lot, but we need to be conservative
printf("#define HAVE_VFPV4\n"); // TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n"); printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break; break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX: case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n"); printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L1_DATA_LINESIZE 128\n");
@ -248,11 +270,7 @@ void get_cpuconfig(void)
break; break;
case CPU_THUNDERX2T99: case CPU_THUNDERX2T99:
printf("#define VULCAN \n"); printf("#define THUNDERX2T99 \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n");

View File

@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_P5600 1 #define CPU_P5600 1
#define CPU_1004K 2
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"P5600" "P5600",
"1004K"
}; };
int detect(void){ int detect(void){
@ -90,7 +92,7 @@ int detect(void){
if (!strncmp("cpu", buffer, 3)){ if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
#if 0 #if 0
fprintf(stderr, "%s\n", p); fprintf(stderr, "%s \n", p);
#endif #endif
break; break;
} }
@ -99,43 +101,13 @@ int detect(void){
fclose(infile); fclose(infile);
if(p != NULL){ if(p != NULL){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "5600")) {
return CPU_LOONGSON3A; return CPU_P5600;
}else if(strstr(p, "Loongson-3B")){ } else if (strstr(p, "1004K")) {
return CPU_LOONGSON3B; return CPU_1004K;
}else if (strstr(p, "Loongson-3")){ } else
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
@ -149,7 +121,7 @@ void get_architecture(void){
} }
void get_subarchitecture(void){ void get_subarchitecture(void){
if(detect()==CPU_P5600){ if(detect()==CPU_P5600|| detect()==CPU_1004K){
printf("P5600"); printf("P5600");
}else{ }else{
printf("UNKNOWN"); printf("UNKNOWN");
@ -170,6 +142,14 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n"); printf("#define L2_ASSOCIATIVE 8\n");
} else if (detect()==CPU_1004K) {
  /* Configuration macros emitted for the MIPS 1004K core. */
  printf("#define MIPS1004K\n");
  printf("#define L1_DATA_SIZE 32768\n");
  printf("#define L1_DATA_LINESIZE 32\n");
  /* 262144 bytes = 256 KiB.  The value used to be 26144, which is not a
   * power of two and looks like a dropped digit.
   * NOTE(review): confirm against the L2 size of the targeted 1004K parts. */
  printf("#define L2_SIZE 262144\n");
  printf("#define DTB_DEFAULT_ENTRIES 8\n");
  printf("#define DTB_SIZE 4096\n");
  printf("#define L2_ASSOCIATIVE 4\n");
}else{ }else{
printf("#define UNKNOWN\n"); printf("#define UNKNOWN\n");
} }
@ -178,6 +158,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
if(detect()==CPU_P5600) { if(detect()==CPU_P5600) {
printf("p5600\n"); printf("p5600\n");
} else if (detect()==CPU_1004K) {
printf("1004K\n");
}else{ }else{
printf("mips\n"); printf("mips\n");
} }

View File

@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_I6500 6 #define CPU_I6500 6
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"SICORTEX", "SICORTEX",
"LOONGSON3A", "LOONGSON3A",
"LOONGSON3B", "LOONGSON3B",

View File

@ -56,6 +56,7 @@
#define CPUTYPE_CELL 6 #define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7 #define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8 #define CPUTYPE_POWER8 8
#define CPUTYPE_POWER9 9
char *cpuname[] = { char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
@ -66,7 +67,8 @@ char *cpuname[] = {
"POWER6", "POWER6",
"CELL", "CELL",
"PPCG4", "PPCG4",
"POWER8" "POWER8",
"POWER9"
}; };
char *lowercpuname[] = { char *lowercpuname[] = {
@ -78,7 +80,8 @@ char *lowercpuname[] = {
"power6", "power6",
"cell", "cell",
"ppcg4", "ppcg4",
"power8" "power8",
"power9"
}; };
char *corename[] = { char *corename[] = {
@ -90,6 +93,7 @@ char *corename[] = {
"POWER6", "POWER6",
"CELL", "CELL",
"PPCG4", "PPCG4",
"POWER8",
"POWER8" "POWER8"
}; };
@ -120,6 +124,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
@ -127,6 +132,33 @@ int detect(void){
#endif #endif
#ifdef _AIX #ifdef _AIX
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = popen("prtconf|grep 'Processor Type'", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Pro", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
pclose(infile);
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
return CPUTYPE_POWER5; return CPUTYPE_POWER5;
#endif #endif
@ -142,6 +174,52 @@ int detect(void){
return CPUTYPE_PPC970; return CPUTYPE_PPC970;
#endif #endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
  /* BSD: read the processor version register with mfpvr and dispatch on its
   * upper 16 bits.  The value is held in an unsigned variable so the shift
   * cannot sign-extend; with a signed int, a register value with the top bit
   * set (needed for the 0x8003 case) would shift to a negative number and
   * never match. */
  unsigned int id;
  __asm __volatile("mfpvr %0" : "=r"(id));
  switch ( id >> 16 ) {
  case 0x4e: // POWER9 (reported as POWER8)
    return CPUTYPE_POWER8;
  case 0x4d:
  case 0x4b: // POWER8/8E
    return CPUTYPE_POWER8;
  case 0x4a:
  case 0x3f: // POWER7/7E (reported as POWER6)
    return CPUTYPE_POWER6;
  case 0x3e:
    return CPUTYPE_POWER6;
  case 0x3a:
    return CPUTYPE_POWER5;
  case 0x35:
  case 0x38: // POWER4 /4+
    return CPUTYPE_POWER4;
  case 0x40:
  case 0x41: // POWER3 /3+
    return CPUTYPE_POWER3;
  case 0x39:
  case 0x3c:
  case 0x44:
  case 0x45:
    return CPUTYPE_PPC970;
  case 0x70:
    return CPUTYPE_CELL;
  case 0x8003:
    return CPUTYPE_PPCG4;
  default:
    return CPUTYPE_UNKNOWN;
  }
#endif
} }
void get_architecture(void){ void get_architecture(void){

View File

@ -57,3 +57,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
printf("v9\n"); printf("v9\n");
} }
/* Return the core name reported for SPARC targets. */
char *get_corename(void)
{
  char *corename = "sparc";

  return corename;
}

View File

@ -50,6 +50,8 @@
#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM #define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
#define CORE_SKYLAKEX CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
@ -95,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
("mov %%ebx, %%edi;" ("mov %%ebx, %%edi;"
"cpuid;" "cpuid;"
"xchgl %%ebx, %%edi;" "xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
#else #else
__asm__ __volatile__ __asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
#endif #endif
} }
@ -209,6 +211,44 @@ int support_avx(){
#endif #endif
} }
/*
 * Report whether AVX2 can be used.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises AVX2,
 * otherwise 0.  Always returns 0 when the build defines NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;
  int ret=0;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* AVX2 is EBX bit 5 of leaf 7 (the same bit support_avx512() tests as
   * `ebx & 32`).  Bit 7, tested previously, is SMEP and is also set on
   * CPUs that have no AVX2. */
  if((ebx & (1<<5)) != 0)
    ret=1;  //CPU reports AVX2
  return ret;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (as indicated by the AVX512VL feature bit) can be used.
 *
 * Returns 1 only when all of the following hold, otherwise 0:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 EBX has bit 5 (AVX2) and bit 31 (AVX512VL) set,
 *   - the value returned by xgetbv(0) has all of bits 5-7 (mask 0xe0) set.
 * Always returns 0 when the build defines NO_AVX512.
 */
int support_avx512(){
#ifndef NO_AVX512
  int eax, ebx, ecx, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* Previously this check only cleared a flag and fell through, so it had
   * no effect; bail out for real when AVX2 is missing. */
  if((ebx & 32) != 32)
    return 0;  //not even AVX2
  /* Unsigned constant: 1<<31 overflows a signed int. */
  if((ebx & (1u<<31)) == 0)
    return 0;  //no AVX512VL
  xgetbv(0, &eax, &edx);
  if((eax & 0xe0) != 0xe0)
    return 0;  //required state bits not enabled
  return 1;    //OS supports AVX512VL
#else
  return 0;
#endif
}
int get_vendor(void){ int get_vendor(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
@ -231,6 +271,7 @@ int get_vendor(void){
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@ -292,6 +333,8 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX #ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX; if (support_avx()) feature |= HAVE_AVX;
if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif #endif
@ -1004,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
} }
} }
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { if ((get_vendor() == VENDOR_AMD) ||
(get_vendor() == VENDOR_HYGON) ||
(get_vendor() == VENDOR_CENTAUR)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx); cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
LDTB.size = 4096; LDTB.size = 4096;
@ -1226,22 +1271,18 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
case 15: case 15:
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 13: case 13:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
@ -1250,33 +1291,27 @@ int get_cpuname(void){
switch (model) { switch (model) {
case 5: case 5:
case 6: case 6:
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 7: case 7:
case 15: case 15:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 14: case 14:
//Skylake //Skylake
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
@ -1290,33 +1325,36 @@ int get_cpuname(void){
switch (model) { switch (model) {
case 6: case 6:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 5: case 5:
// Skylake X
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 7: case 7:
// Xeon Phi Knights Landing // Xeon Phi Knights Landing
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
@ -1324,16 +1362,27 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 6:
switch (model) {
case 6: // Cannon Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9: case 9:
case 8: case 8:
switch (model) { switch (model) {
case 14: // Kaby Lake case 14: // Kaby Lake
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
@ -1420,6 +1469,8 @@ int get_cpuname(void){
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CPUTYPE_ZEN; return CPUTYPE_ZEN;
@ -1435,6 +1486,26 @@ int get_cpuname(void){
return CPUTYPE_AMD_UNKNOWN; return CPUTYPE_AMD_UNKNOWN;
} }
if (vendor == VENDOR_HYGON){
  /* Only family 0xf with extended family 9 (Hygon Dhyana) is recognised;
   * every other Hygon part is reported as CPUTYPE_HYGON_UNKNOWN. */
  if ((family == 0xf) && (exfamily == 9)) {
    if (!support_avx())
      return CPUTYPE_BARCELONA;
#ifdef NO_AVX2
    return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#else
    return CPUTYPE_ZEN;
#endif
  }
  return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){ if (vendor == VENDOR_CYRIX){
switch (family) { switch (family) {
case 0x4: case 0x4:
@ -1556,6 +1627,8 @@ static char *cpuname[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX",
"DHYANA"
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1610,10 +1683,12 @@ static char *lowercpuname[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex",
"dhyana"
}; };
static char *corename[] = { static char *corename[] = {
"UNKOWN", "UNKNOWN",
"80486", "80486",
"P5", "P5",
"P6", "P6",
@ -1641,6 +1716,8 @@ static char *corename[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX",
"DHYANA"
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1672,6 +1749,8 @@ static char *corename_lower[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex",
"dhyana"
}; };
@ -1860,6 +1939,19 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
case 5: case 5:
// Skylake X
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx())
@ -1958,6 +2050,8 @@ int get_coretype(void){
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen
case 8:
// Ryzen 2
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CORE_ZEN; return CORE_ZEN;
@ -1973,6 +2067,23 @@ int get_coretype(void){
} }
} }
if ((vendor == VENDOR_HYGON) && (family == 0xf)){
  /* Hygon family 0xf: extended family 9 with AVX maps to the Zen core (or
   * Sandy Bridge when AVX2 is compiled out); everything else in this family
   * maps to Barcelona.  Other Hygon families fall through to the code below. */
  if ((exfamily != 9) || !support_avx())
    return CORE_BARCELONA;
#ifdef NO_AVX2
  return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#else
  return CORE_ZEN;
#endif
}
if (vendor == VENDOR_CENTAUR) { if (vendor == VENDOR_CENTAUR) {
switch (family) { switch (family) {
case 0x6: case 0x6:
@ -2059,6 +2170,8 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@ -2127,6 +2240,8 @@ void get_sse(void){
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");

View File

@ -29,15 +29,18 @@
#define CPU_GENERIC 0 #define CPU_GENERIC 0
#define CPU_Z13 1 #define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = { static char *cpuname[] = {
"ZARCH_GENERIC", "ZARCH_GENERIC",
"Z13" "Z13",
"Z14"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
"zarch_generic", "zarch_generic",
"z13" "z13",
"z14"
}; };
int detect(void) int detect(void)
@ -62,6 +65,10 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC; return CPU_GENERIC;
} }
@ -107,5 +114,9 @@ void get_cpuconfig(void)
printf("#define Z13\n"); printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
break; break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
} }
} }

12
ctest.c
View File

@ -60,6 +60,14 @@ OS_FREEBSD
OS_NETBSD OS_NETBSD
#endif #endif
#if defined(__OpenBSD__)
OS_OPENBSD
#endif
#if defined(__DragonFly__)
OS_DRAGONFLY
#endif
#if defined(__sun) #if defined(__sun)
OS_SUNOS OS_SUNOS
#endif #endif
@ -93,6 +101,10 @@ OS_INTERIX
OS_LINUX OS_LINUX
#endif #endif
#if defined(__HAIKU__)
OS_HAIKU
#endif
#if defined(__i386) || defined(_X86) #if defined(__i386) || defined(_X86)
ARCH_X86 ARCH_X86
#endif #endif

View File

@ -102,7 +102,13 @@ clean ::
rm -f x* rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB = ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
# Single real # Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

View File

@ -62,9 +62,36 @@
#endif #endif
#endif #endif
#ifndef TRANSA #ifndef thread_local
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
# define thread_local _Thread_local
# elif defined _WIN32 && ( \
defined _MSC_VER || \
defined __ICL || \
defined __DMC__ || \
defined __BORLANDC__ )
# define thread_local __declspec(thread)
/* note that ICC (linux) and Clang are covered by __GNUC__ */
# elif defined __GNUC__ || \
defined __SUNPRO_C || \
defined __xlC__
# define thread_local __thread
# else
# define UNSAFE
#endif
#endif
#if defined USE_OPENMP
#undef UNSAFE
#endif
#if !defined(TRANSA) && !defined(UNSAFE)
#define Y_DUMMY_NUM 1024 #define Y_DUMMY_NUM 1024
#if defined(USE_OPENMP)
static FLOAT y_dummy[Y_DUMMY_NUM]; static FLOAT y_dummy[Y_DUMMY_NUM];
#pragma omp threadprivate(y_dummy)
# else
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
# endif
#endif #endif
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef TRANSA #ifdef TRANSA
y += n_from * incy * COMPSIZE; y += n_from * incy * COMPSIZE;
#else #else
# ifndef UNSAFE
//for split matrix row (n) direction and vector x of gemv_n //for split matrix row (n) direction and vector x of gemv_n
x += n_from * incx * COMPSIZE; x += n_from * incx * COMPSIZE;
//store partial result for every thread //store partial result for every thread
y += (m_to - m_from) * 1 * COMPSIZE * pos; y += (m_to - m_from) * 1 * COMPSIZE * pos;
# endif
#endif #endif
} }
@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
int split_x=0; int split_x=0;
#endif #endif
@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
i -= width; i -= width;
} }
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
//try to split matrix on row direction and x. //try to split matrix on row direction and x.
//Then, reduction. //Then, reduction.
if (num_cpu < nthreads) { if (num_cpu < nthreads) {
@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
} }
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
if(split_x==1){ if(split_x==1){
//reduction //reduction
for(i=0; i<num_cpu; i++){ for(i=0; i<num_cpu; i++){

View File

@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */ for (is = 0; is < m; is += DTB_ENTRIES){
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now */
for (is = 0; is < m; is += DTB_ENTRIES * 100){ min_i = MIN(m - is, DTB_ENTRIES);
min_i = MIN(m - is, DTB_ENTRIES * 100);
#ifndef TRANSA #ifndef TRANSA
if (is > 0){ if (is > 0){
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
GEMV_N(is, min_i, 0, dp1, GEMV_N(is, min_i, 0, dp1,
a + is * lda, lda, a + is * lda, lda,
B + is, 1, B + is, 1,

View File

@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

View File

@ -91,7 +91,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -67,7 +67,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -48,6 +48,10 @@
#define SWITCH_RATIO 2 #define SWITCH_RATIO 2
#endif #endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
//The array of job_t may overflow the stack. //The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t. //Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -91,7 +95,8 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; volatile
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
@ -346,7 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using workspace */ /* Make sure if no one is using workspace */
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING) #if defined(FUSED_GEMM) && !defined(TIMING)
@ -408,7 +413,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */ /* Wait until other region of B is initialized */
START_RPCC(); START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */ /* Apply kernel with local region of A and part of other region of B */
@ -426,6 +431,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */ /* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
} }
} }
} while (current != mypos); } while (current != mypos);
@ -487,7 +493,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) { for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
} }
} }
STOP_RPCC(waiting3); STOP_RPCC(waiting3);
@ -508,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
return 0; return 0;
} }
/*
 * Round a partition width up to the next multiple of `multiple`.
 *
 * remainder - amount of work still left to distribute
 * width     - proposed partition width
 * multiple  - granularity the width should be aligned to
 *
 * The width is returned unchanged when `multiple` is larger than
 * `remainder`, or when `width` does not exceed `multiple`; otherwise the
 * smallest multiple of `multiple` that is >= `width` is returned.
 */
static int round_up(int remainder, int width, int multiple)
{
  int needs_alignment = (multiple <= remainder) && (width > multiple);

  if (needs_alignment) {
    /* Ceiling division, then scale back up to an aligned width. */
    int chunks = (width + multiple - 1) / multiple;
    width = chunks * multiple;
  }

  return width;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb, *range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) { BLASLONG nthreads_m, BLASLONG nthreads_n) {
/* Lock declared for builds without OpenMP; it is acquired and released
   further down in this function. */
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
/* POSIX: a single statically initialized mutex. */
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
/* NOTE(review): unlike the pthread mutex above, this critical section is not
   declared static and is initialized each time this point is reached, so it
   looks like concurrent callers would not share the same lock object, and no
   matching DeleteCriticalSection is visible here - confirm and consider a
   static object with one-time initialization. */
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg; blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP #ifndef USE_ALLOC_HEAP
@ -552,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif #endif
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
#ifdef USE_ALLOC_HEAP #ifdef USE_ALLOC_HEAP
/* Dynamically allocate workspace */ /* Dynamically allocate workspace */
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
@ -599,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0; num_parts = 0;
while (m > 0){ while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
width = round_up(m, width, GEMM_PREFERED_SIZE);
m -= width; m -= width;
if (m < 0) width = width + m; if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width; range_M[num_parts + 1] = range_M[num_parts] + width;
num_parts ++; num_parts ++;
} }
for (i = num_parts; i < MAX_CPU_NUMBER; i++) { for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@ -643,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
if (width < SWITCH_RATIO) { if (width < SWITCH_RATIO) {
width = SWITCH_RATIO; width = SWITCH_RATIO;
} }
width = round_up(n, width, GEMM_PREFERED_SIZE);
n -= width; n -= width;
if (n < 0) width = width + n; if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width; range_N[num_parts + 1] = range_N[num_parts] + width;
num_parts ++; num_parts ++;
} }
for (j = num_parts; j < MAX_CPU_NUMBER; j++) { for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
@ -653,8 +694,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
} }
/* Clear synchronization flags */ /* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) { for (i = 0; i < nthreads; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) { for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) { for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0; job[i].working[j][CACHE_LINE_SIZE * k] = 0;
} }
@ -669,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job); free(job);
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0; return 0;
} }

View File

@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
BLASLONG width, i; BLASLONG width, i;
BLASLONG n_from, n_to; BLASLONG n_from, n_to;
double dnum, nf, nt, di; double dnum, nf, nt, di, dinum;
int num_cpu; int num_cpu;
int mask = 0; int mask = 0;
@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)i; di = (double)i;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); dinum = di * di +dnum;
if (dinum <0)
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
else
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
nf = (double)(arg -> n - n_from); nf = (double)(arg -> n - n_from);
nt = (double)(arg -> n - n_to); nt = (double)(arg -> n - n_to);
dnum = (nt * nt - nf * nf) / (double)nthreads; dnum = (nt * nt - nf * nf) / (double)nthreads;
num_cpu = 0; num_cpu = 0;
range[0] = n_from; range[0] = n_from;
@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i); di = (double)(arg -> n - i);
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); dinum = di * di + dnum;
if (dinum<0)
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
else
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;
} else { } else {

View File

@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic.c) if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()
else () else ()
list(APPEND COMMON_SOURCES parameter.c) list(APPEND COMMON_SOURCES parameter.c)
endif () endif ()

View File

@ -15,7 +15,11 @@ endif
# COMMONOBJS += info.$(SUFFIX) # COMMONOBJS += info.$(SUFFIX)
ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX) COMMONOBJS += dynamic.$(SUFFIX)
endif
else else
COMMONOBJS += parameter.$(SUFFIX) COMMONOBJS += parameter.$(SUFFIX)
endif endif
@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c
endif endif
ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
else else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif endif

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/ /*********************************************************************/
#include "common.h" #include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
#include <dlfcn.h> #include <dlfcn.h>
#include <signal.h> #include <signal.h>
#include <sys/resource.h> #include <sys/resource.h>
@ -582,7 +582,7 @@ int blas_thread_init(void){
if(ret!=0){ if(ret!=0){
struct rlimit rlim; struct rlimit rlim;
const char *msg = strerror(ret); const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
#ifdef RLIMIT_NPROC #ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
long i; long i;
#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY #ifndef NO_AFFINITY

View File

@ -36,6 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
//#include <sys/mman.h> //#include <sys/mman.h>
@ -47,13 +48,22 @@
#else #else
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER]; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#if __STDC_VERSION__ >= 201112L
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#else
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#endif
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
int i=0; int i=0, j=0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
@ -68,15 +78,17 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread //adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]==NULL){ for(j=0; j<blas_cpu_number; j++){
blas_thread_buffer[i]=blas_memory_alloc(2); if(blas_thread_buffer[i][j]==NULL){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
} }
} for(; j<MAX_CPU_NUMBER; j++){
for(; i<MAX_CPU_NUMBER; i++){ if(blas_thread_buffer[i][j]!=NULL){
if(blas_thread_buffer[i]!=NULL){ blas_memory_free(blas_thread_buffer[i][j]);
blas_memory_free(blas_thread_buffer[i]); blas_thread_buffer[i][j]=NULL;
blas_thread_buffer[i]=NULL; }
} }
} }
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
@ -92,30 +104,34 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){ int blas_thread_init(void){
int i=0; int i=0, j=0;
blas_get_cpu_number(); blas_get_cpu_number();
blas_server_avail = 1; blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
blas_thread_buffer[i]=blas_memory_alloc(2); for(j=0; j<blas_num_threads; j++){
} blas_thread_buffer[i][j]=blas_memory_alloc(2);
for(; i<MAX_CPU_NUMBER; i++){ }
blas_thread_buffer[i]=NULL; for(; j<MAX_CPU_NUMBER; j++){
blas_thread_buffer[i][j]=NULL;
}
} }
return 0; return 0;
} }
int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i=0; int i=0, j=0;
blas_server_avail = 0; blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]!=NULL){ for(j=0; j<MAX_CPU_NUMBER; j++){
blas_memory_free(blas_thread_buffer[i]); if(blas_thread_buffer[i][j]!=NULL){
blas_thread_buffer[i]=NULL; blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
} }
} }
@ -206,7 +222,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} }
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue, int buf_index){
void *buffer, *sa, *sb; void *buffer, *sa, *sb;
int pos=0, release_flag=0; int pos=0, release_flag=0;
@ -223,7 +239,7 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num(); pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos]; buffer = blas_thread_buffer[buf_index][pos];
//fallback //fallback
if(buffer==NULL) { if(buffer==NULL) {
@ -291,7 +307,7 @@ static void exec_threads(blas_queue_t *queue){
int exec_blas(BLASLONG num, blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){
BLASLONG i; BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0; if ((num <= 0) || (queue == NULL)) return 0;
@ -302,16 +318,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
} }
#endif #endif
#pragma omp parallel for schedule(static) while(true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#if __STDC_VERSION__ >= 201112L
_Bool inuse = false;
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
#else
if(blas_buffer_inuse[i] == false) {
blas_buffer_inuse[i] = true;
#endif
buf_index = i;
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef USE_SIMPLE_THREADED_LEVEL3
queue[i].position = i; queue[i].position = i;
#endif #endif
exec_threads(&queue[i]); exec_threads(&queue[i], buf_index);
} }
#if __STDC_VERSION__ >= 201112L
atomic_store(&blas_buffer_inuse[buf_index], false);
#else
blas_buffer_inuse[buf_index] = false;
#endif
return 0; return 0;
} }

View File

@ -40,6 +40,14 @@
#include <stdlib.h> #include <stdlib.h>
#include "common.h" #include "common.h"
#if defined(OS_CYGWIN_NT) && !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
/* This is a thread implementation for Win32 lazy implementation */ /* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */ /* Thread server common infomation */
@ -53,7 +61,7 @@ typedef struct{
} blas_pool_t; } blas_pool_t;
/* We need this grobal for cheking if initialization is finished. */ /* We need this global for cheking if initialization is finished. */
int blas_server_avail = 0; int blas_server_avail = 0;
/* Local Variables */ /* Local Variables */
@ -340,6 +348,11 @@ int blas_thread_init(void){
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
blas_queue_t *current; blas_queue_t *current;
current = queue; current = queue;
@ -405,6 +418,11 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
/* Execute Threads */ /* Execute Threads */
int exec_blas(BLASLONG num, blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
#ifndef ALL_THREADED #ifndef ALL_THREADED
int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
#endif #endif
@ -460,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
void goto_set_num_threads(int num_threads) void goto_set_num_threads(int num_threads)
{ {
long i; long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number; if (num_threads < 1) num_threads = blas_cpu_number;

View File

@ -49,6 +49,167 @@
#define EXTERN #define EXTERN
#endif #endif
#ifdef DYNAMIC_LIST
/*
 * DYNAMIC_LIST build: only the targets selected through DYN_<NAME> macros
 * get their own kernel table. Every other target name is turned into a
 * macro alias of a target that is available, so code that refers to
 * gotoblas_<NAME> keeps compiling.
 *
 * PRESCOTT is declared unconditionally and is the last-resort alias for
 * every group below.
 */
extern gotoblas_t gotoblas_PRESCOTT;
/* Targets whose only fallback is the PRESCOTT baseline. */
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
/* BARCELONA prefers NEHALEM as its fallback when that target is listed. */
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
/*
 * ATOM kernels: use the dedicated table when DYN_ATOM is selected, otherwise
 * alias to NEHALEM if that target is in the dynamic list, and to the
 * PRESCOTT baseline as the last resort.
 *
 * The middle branch must be a real `#elif` directive. Written as a bare
 * `elif`, it is an ordinary token: with DYN_ATOM defined it is pasted into
 * the translation unit and the following `#define` overrides the extern
 * declaration; without DYN_ATOM the NEHALEM fallback is silently skipped.
 */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
/*
 * Remaining DYNAMIC_LIST targets. Each group resolves gotoblas_<NAME> to its
 * own kernel table when DYN_<NAME> is defined; otherwise the name becomes a
 * macro alias of the first listed fallback that is itself selected, ending
 * with gotoblas_PRESCOTT.
 */
/* Targets whose only fallback is PRESCOTT. */
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
/* Fallback order: NEHALEM, then PRESCOTT. */
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
/* Fallback order: SANDYBRIDGE, then NEHALEM, then PRESCOTT. */
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
/* Fallback order: HASWELL, then SANDYBRIDGE, then NEHALEM, then PRESCOTT. */
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD; EXTERN gotoblas_t gotoblas_NORTHWOOD;
@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON; EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT; extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO; extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON; extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX #ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_BULLDOZER;
@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR; extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2 #ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else #else
extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN; extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif #endif
#else #else
//Use NEHALEM kernels for sandy bridge //Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA
@ -90,10 +269,12 @@ extern gotoblas_t gotoblas_ZEN;
#define gotoblas_ZEN gotoblas_BARCELONA #define gotoblas_ZEN gotoblas_BARCELONA
#endif #endif
#endif // DYNAMIC_LIST
#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
#define VENDOR_CENTAUR 3 #define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99 #define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@ -124,9 +305,49 @@ int support_avx(){
#endif #endif
} }
/*
 * Report whether the AVX2 kernels may be used on this machine.
 *
 * Returns 1 when support_avx() returns nonzero and CPUID leaf 7 advertises
 * AVX2, 0 otherwise. Always returns 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  /* ecx is zeroed before the call, as in the original code.
     NOTE(review): presumably the cpuid helper uses it to select sub-leaf 0;
     confirm against its definition. */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 feature flag. Bit 7 of that
     register is SMEP and says nothing about AVX2, so it must not be used. */
  return (ebx & (1<<5)) != 0;
#else
  return 0;
#endif
}
/*
 * Report whether the AVX512 (SkylakeX) kernels may be used on this machine.
 *
 * Returns 1 only when all of the following hold, 0 otherwise:
 *   - support_avx() returns nonzero,
 *   - CPUID leaf 7 advertises AVX2 (EBX bit 5) and AVX512VL (EBX bit 31),
 *   - XCR0 has the bits in mask 0xe0 set, i.e. the OS saves the opmask and
 *     ZMM register state.
 * Always returns 0 when built with NO_AVX512.
 */
int support_avx512(){
#ifndef NO_AVX512
  /* ecx is zeroed for consistency with support_avx2().
     NOTE(review): presumably the cpuid helper uses it to select sub-leaf 0;
     confirm against its definition. */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* A masked bit is either 0 or the mask itself, so it is compared against
     0 and the function really returns when AVX2 is missing. */
  if((ebx & (1<<5)) == 0)
    return 0;  //CPU does not even support AVX2
  /* Unsigned literal: shifting a signed 1 into bit 31 overflows int. */
  if((ebx & (1u<<31)) == 0)
    return 0;  //CPU does not advertise AVX512VL
  xgetbv(0, &eax, &edx);
  return (eax & 0xe0) == 0xe0;  //OS supports AVX512VL
#else
  return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1 #define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
static int get_vendor(void){ static int get_vendor(void){
@ -149,6 +370,7 @@ static int get_vendor(void){
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@ -223,18 +445,24 @@ static gotoblas_t *get_coretype(void){
} }
//Intel Haswell //Intel Haswell
if (model == 12 || model == 15) { if (model == 12 || model == 15) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Intel Broadwell //Intel Broadwell
if (model == 13) { if (model == 13) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
@ -244,27 +472,36 @@ static gotoblas_t *get_coretype(void){
case 4: case 4:
//Intel Haswell //Intel Haswell
if (model == 5 || model == 6) { if (model == 5 || model == 6) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Intel Broadwell //Intel Broadwell
if (model == 7 || model == 15) { if (model == 7 || model == 15) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Intel Skylake //Intel Skylake
if (model == 14) { if (model == 14) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
@ -277,27 +514,54 @@ static gotoblas_t *get_coretype(void){
case 5: case 5:
//Intel Broadwell //Intel Broadwell
if (model == 6) { if (model == 6) {
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Intel Skylake if (model == 5) {
if (model == 14 || model == 5) { // Intel Skylake X
if(support_avx()) if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ }
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Intel Phi Knights Landing //Intel Phi Knights Landing
if (model == 7) { if (model == 7) {
if(support_avx()) if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ }
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
@ -307,12 +571,29 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; return &gotoblas_NEHALEM;
} }
return NULL; return NULL;
case 6:
if (model == 6) {
// Cannon Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9: case 9:
case 8: case 8:
if (model == 14 ) { // Kaby Lake if (model == 14 ) { // Kaby Lake
if(support_avx()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
else{ if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
@ -325,7 +606,7 @@ static gotoblas_t *get_coretype(void){
} }
} }
if (vendor == VENDOR_AMD){ if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
if (family <= 0xe) { if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx); cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
@ -397,7 +678,7 @@ static gotoblas_t *get_coretype(void){
} }
} }
} else if (exfamily == 8) { } else if (exfamily == 8) {
if (model == 1) { if (model == 1 || model == 8) {
if(support_avx()) if(support_avx())
return &gotoblas_ZEN; return &gotoblas_ZEN;
else{ else{
@ -405,6 +686,13 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
} }
} }
} else if (exfamily == 9) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else { }else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
@ -445,7 +733,8 @@ static char *corename[] = {
"Haswell", "Haswell",
"Steamroller", "Steamroller",
"Excavator", "Excavator",
"Zen" "Zen",
"SkylakeX"
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -473,7 +762,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0]; return corename[0];
} }
@ -485,7 +774,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128]; char message[128];
//char mname[20]; //char mname[20];
for ( i=1 ; i <= 23; i++) for ( i=1 ; i <= 24; i++)
{ {
if (!strncasecmp(coretype,corename[i],20)) if (!strncasecmp(coretype,corename[i],20))
{ {
@ -503,6 +792,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found) switch (found)
{ {
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN); case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR); case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER); case 21: return (&gotoblas_STEAMROLLER);

View File

@ -0,0 +1,198 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#include <asm/hwcap.h>
#include <sys/auxv.h>
/*
 * Per-core gotoblas_t tables, defined elsewhere. This file picks one of
 * them at runtime and installs it in the global `gotoblas` pointer.
 */
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
/* Warning/diagnostic sink; `verbose` is the level passed by callers below. */
extern void openblas_warning(int verbose, const char * msg);
/*
 * Number of selectable core types. It is also used as the index of the
 * trailing "unknown" entry in corename[], so it must stay equal to the
 * number of real names in that table.
 */
#define NUM_CORETYPES 4
/*
* In case asm/hwcap.h is outdated on the build system, make sure
* that HWCAP_CPUID is defined
*/
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
/*
 * Read the system register named `id` into `var` using an MRS instruction.
 * `id` is stringified into the instruction text, so it must be a bare
 * register name (e.g. MIDR_EL1). Written as a GNU statement expression.
 * The caller in this file only uses it after checking HWCAP_CPUID.
 */
#define get_cpu_ftr(id, var) ({ \
asm("mrs %0, "#id : "=r" (var)); \
})
/*
 * Lowercase core names, indexed 0..NUM_CORETYPES-1 in the same order that
 * gotoblas_corename() and force_coretype() map indices to gotoblas_* tables.
 * The final entry (index NUM_CORETYPES) is the "unknown" placeholder and is
 * never matched by force_coretype(), whose loop stops before it.
 */
static char *corename[] = {
"armv8",
"cortexa57",
"thunderx",
"thunderx2t99",
"unknown"
};
/*
 * Return the name of the core whose table is currently installed in
 * `gotoblas`. If `gotoblas` matches none of the known tables the
 * "unknown" placeholder is returned.
 */
char *gotoblas_corename(void) {
  /* Start at the trailing "unknown" slot and narrow down from there. */
  int slot = NUM_CORETYPES;

  if (gotoblas == &gotoblas_ARMV8) {
    slot = 0;
  } else if (gotoblas == &gotoblas_CORTEXA57) {
    slot = 1;
  } else if (gotoblas == &gotoblas_THUNDERX) {
    slot = 2;
  } else if (gotoblas == &gotoblas_THUNDERX2T99) {
    slot = 3;
  }

  return corename[slot];
}
/*
 * Map a user-supplied core name to its gotoblas table.
 *
 * The name is compared case-insensitively (at most 20 characters) against
 * the first NUM_CORETYPES entries of corename[]. On the first match the
 * corresponding table is returned. If nothing matches, a level-1 warning
 * is issued and NULL is returned.
 */
static gotoblas_t *force_coretype(char *coretype) {
  char warn_buf[128];
  int idx;

  for (idx = 0; idx < NUM_CORETYPES; idx++) {
    if (strncasecmp(coretype, corename[idx], 20) != 0)
      continue;

    /* First match wins; idx is always one of the cases below. */
    switch (idx) {
      case 0: return &gotoblas_ARMV8;
      case 1: return &gotoblas_CORTEXA57;
      case 2: return &gotoblas_THUNDERX;
      case 3: return &gotoblas_THUNDERX2T99;
    }
  }

  snprintf(warn_buf, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, warn_buf);
  return NULL;
}
/*
 * Auto-detect the running core and return the matching gotoblas table.
 *
 * Returns NULL when the kernel does not advertise HWCAP_CPUID (a warning
 * is issued in that case) or when the implementer/part pair read from
 * MIDR_EL1 is not one of the combinations listed below. The caller is
 * expected to handle NULL.
 */
static gotoblas_t *get_coretype(void) {
  int implementer, part;
  /* MIDR_EL1 is read with a 64-bit MRS; keep the destination 64-bit and
   * unsigned so the field extraction below never shifts a signed value. */
  unsigned long long midr_el1;

  if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
    openblas_warning(1, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
    return NULL;
  }

  get_cpu_ftr(MIDR_EL1, midr_el1);
  /*
   * MIDR_EL1
   *
   * 31          24 23     20 19          16 15          4 3        0
   * -----------------------------------------------------------------
   * | Implementer | Variant | Architecture | Part Number | Revision |
   * -----------------------------------------------------------------
   */
  implementer = (int)((midr_el1 >> 24) & 0xFF);
  part        = (int)((midr_el1 >> 4)  & 0xFFF);

  switch (implementer)
  {
    case 0x41: // ARM
      switch (part)
      {
        case 0xd07: // Cortex A57
        case 0xd08: // Cortex A72
        case 0xd03: // Cortex A53
          /* All three share the CORTEXA57 table. */
          return &gotoblas_CORTEXA57;
      }
      break;
    case 0x42: // Broadcom
      switch (part)
      {
        case 0x516: // Vulcan
          return &gotoblas_THUNDERX2T99;
      }
      break;
    case 0x43: // Cavium
      switch (part)
      {
        case 0x0a1: // ThunderX
          return &gotoblas_THUNDERX;
        case 0x0af: // ThunderX2
          return &gotoblas_THUNDERX2T99;
      }
      break;
  }

  /* Unrecognised implementer/part. */
  return NULL;
}
/*
 * Select and initialise the gotoblas table for this process.
 *
 * Does nothing if `gotoblas` is already set. Otherwise:
 *   - if the OPENBLAS_CORETYPE environment variable is set, the named core
 *     is looked up with force_coretype();
 *   - if it is not set, the core is auto-detected with get_coretype();
 *   - if either path yields NULL, the generic ARMV8 table is used and a
 *     level-1 warning is issued.
 * The chosen table's init() hook is then called. If the table has no
 * init hook, an error is reported and the process exits with status 1.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if (p)
  {
    gotoblas = force_coretype(p);
  }
  else
  {
    gotoblas = get_coretype();
  }

  if (gotoblas == NULL)
  {
    openblas_warning(1, "Falling back to generic ARMV8 core\n");
    gotoblas = &gotoblas_ARMV8;
  }

  if (gotoblas && gotoblas->init) {
    /* Bounded formatting: at most 20 characters of the core name are
     * printed and the result is always NUL-terminated. */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas->init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/*
 * Clear the selected core table. Because gotoblas_dynamic_init() returns
 * early only when `gotoblas` is non-NULL, a later call to it will perform
 * selection again.
 */
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

File diff suppressed because it is too large Load Diff

View File

@ -35,9 +35,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h> #include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str="" static char* openblas_config_str=""
"OpenBLAS "
VERSION
" "
#ifdef USE64BITINT #ifdef USE64BITINT
"USE64BITINT " " USE64BITINT "
#endif #endif
#ifdef NO_CBLAS #ifdef NO_CBLAS
"NO_CBLAS " "NO_CBLAS "
@ -54,6 +63,9 @@ static char* openblas_config_str=""
#ifdef NO_AFFINITY #ifdef NO_AFFINITY
"NO_AFFINITY " "NO_AFFINITY "
#endif #endif
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
CHAR_CORENAME CHAR_CORENAME
#endif #endif
@ -61,18 +73,23 @@ static char* openblas_config_str=""
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
char *gotoblas_corename(); char *gotoblas_corename();
static char tmp_config_str[256];
#endif #endif
static char tmp_config_str[256];
int openblas_get_parallel();
char* CNAME() { char* CNAME() {
#ifndef DYNAMIC_ARCH char tmpstr[20];
return openblas_config_str;
#else
strcpy(tmp_config_str, openblas_config_str); strcpy(tmp_config_str, openblas_config_str);
#ifdef DYNAMIC_ARCH
strcat(tmp_config_str, gotoblas_corename()); strcat(tmp_config_str, gotoblas_corename());
return tmp_config_str;
#endif #endif
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
strcat(tmp_config_str, tmpstr);
return tmp_config_str;
} }
@ -83,3 +100,4 @@ char* openblas_get_corename() {
return gotoblas_corename(); return gotoblas_corename();
#endif #endif
} }

View File

@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){ void blas_set_parameter(void){
int factor; int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
int size = 16; int size = 16;
#else #else
int size = get_L2_size(); int size = get_L2_size();
@ -730,35 +730,8 @@ void blas_set_parameter(void){
#if defined(ARCH_ARM64) #if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void) void blas_set_parameter(void)
{ {
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
} }
#endif #endif

View File

@ -114,20 +114,22 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif endif
ifneq (,$(filter 1 2,$(NOFORTRAN))) ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran #only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif endif
dllinit.$(SUFFIX) : dllinit.c dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $< $(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
so : ../$(LIBSONAME) so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android) ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm
EXTRALIB += -lm
else else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
@ -156,7 +158,7 @@ endif
endif endif
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
so : ../$(LIBSONAME) so : ../$(LIBSONAME)

16
f_check
View File

@ -97,7 +97,7 @@ if ($compiler eq "") {
if ($data =~ /Intel/) { if ($data =~ /Intel/) {
$vendor = INTEL; $vendor = INTEL;
$openmp = "-openmp"; $openmp = "-fopenmp";
} }
if ($data =~ /Sun Fortran/) { if ($data =~ /Sun Fortran/) {
@ -127,7 +127,7 @@ if ($compiler eq "") {
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores. # for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ /zho_ge__/) { if ($data =~ / zho_ge__/) {
$need2bu = 1; $need2bu = 1;
} }
} }
@ -155,7 +155,7 @@ if ($compiler eq "") {
if ($compiler =~ /ifort/) { if ($compiler =~ /ifort/) {
$vendor = INTEL; $vendor = INTEL;
$bu = "_"; $bu = "_";
$openmp = "-openmp"; $openmp = "-fopenmp";
} }
if ($compiler =~ /pathf/) { if ($compiler =~ /pathf/) {
@ -292,9 +292,6 @@ if ($link ne "") {
&& ($flags !~ /^-LIST:/) && ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/) && ($flags !~ /^-LANG:/)
) { ) {
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= $flags . " "; $linker_L .= $flags . " ";
} }
@ -311,17 +308,11 @@ if ($link ne "") {
if ($flags =~ /^\-rpath\@/) { if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g; $flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ; $linker_L .= "-Wl,". $flags . " " ;
} }
if ($flags =~ /^\-rpath-link\@/) { if ($flags =~ /^\-rpath-link\@/) {
$flags =~ s/\@/\,/g; $flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ; $linker_L .= "-Wl,". $flags . " " ;
} }
@ -330,7 +321,6 @@ if ($link ne "") {
&& ($flags !~ /gfortranbegin/) && ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/) && ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/) && ($flags !~ /pathfstart/)
&& ($flags !~ /numa/)
&& ($flags !~ /crt[0-9]/) && ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/) && ($flags !~ /gcc/)
&& ($flags !~ /user32/) && ($flags !~ /user32/)

109
getarch.c
View File

@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
#include <windows.h> #include <windows.h>
#endif #endif
#if defined(__FreeBSD__) || defined(__APPLE__) #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#include <sys/types.h> #include <sys/types.h>
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#endif #endif
#ifdef FORCE_SKYLAKEX
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -912,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DARMV8 " \ #define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "armv8" #define LIBNAME "armv8"
#define CORENAME "ARMV8" #define CORENAME "ARMV8"
#endif #endif
#ifdef FORCE_CORTEXA53
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA53"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA53 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa53"
#define CORENAME "CORTEXA53"
#else
#endif
#ifdef FORCE_CORTEXA57 #ifdef FORCE_CORTEXA57
#define FORCE #define FORCE
#define ARCHITECTURE "ARM64" #define ARCHITECTURE "ARM64"
@ -927,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57" #define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57" #define CORENAME "CORTEXA57"
#else #else
#endif #endif
#ifdef FORCE_VULCAN #ifdef FORCE_CORTEXA72
#define FORCE #define FORCE
#define ARCHITECTURE "ARM64" #define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VULCAN" #define SUBARCHITECTURE "CORTEXA72"
#define SUBDIRNAME "arm64" #define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVULCAN " \ #define ARCHCONFIG "-DCORTEXA72 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "vulcan" #define LIBNAME "cortexa72"
#define CORENAME "VULCAN" #define CORENAME "CORTEXA72"
#else
#endif
#ifdef FORCE_CORTEXA73
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA73"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA73 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa73"
#define CORENAME "CORTEXA73"
#else
#endif
#ifdef FORCE_FALKOR
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FALKOR"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFALKOR " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "falkor"
#define CORENAME "FALKOR"
#else #else
#endif #endif
@ -958,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DTHUNDERX " \ #define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx" #define LIBNAME "thunderx"
#define CORENAME "THUNDERX" #define CORENAME "THUNDERX"
#else #else
#endif #endif
#ifdef FORCE_THUNDERX2T99 #ifdef FORCE_THUNDERX2T99
#define ARMV8
#define FORCE #define FORCE
#define ARCHITECTURE "ARM64" #define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99" #define SUBARCHITECTURE "THUNDERX2T99"
@ -975,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99" #define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99" #define CORENAME "THUNDERX2T99"
#else #else
@ -1003,6 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef FORCE #ifndef FORCE
#ifdef USER_TARGET
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt"
#endif
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
#ifndef POWER #ifndef POWER
@ -1074,7 +1143,7 @@ static int get_num_cores(void) {
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
SYSTEM_INFO sysinfo; SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__APPLE__) #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count; int m[2], count;
size_t len; size_t len;
#endif #endif
@ -1088,7 +1157,7 @@ static int get_num_cores(void) {
GetSystemInfo(&sysinfo); GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors; return sysinfo.dwNumberOfProcessors;
#elif defined(__FreeBSD__) || defined(__APPLE__) #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
m[0] = CTL_HW; m[0] = CTL_HW;
m[1] = HW_NCPU; m[1] = HW_NCPU;
len = sizeof(int); len = sizeof(int);
@ -1116,7 +1185,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("CORE=%s\n", CORENAME); printf("CORE=%s\n", CORENAME);
#else #else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
printf("CORE=%s\n", get_corename()); printf("CORE=%s\n", get_corename());
#endif #endif
#endif #endif
@ -1181,9 +1250,7 @@ int main(int argc, char *argv[]){
#elif NO_PARALLEL_MAKE==1 #elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n"); printf("MAKE += -j 1\n");
#else #else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores()); printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif #endif
break; break;
@ -1224,7 +1291,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else #else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif #endif
#endif #endif

View File

@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
CSBLAS1OBJS = \ CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
@ -277,7 +277,7 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(SUFFIX)
CDBLAS1OBJS = \ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
@ -294,7 +294,7 @@ CDBLAS3OBJS += \
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(SUFFIX)
CCBLAS1OBJS = \ CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_ccopy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
@ -320,7 +320,7 @@ CCBLAS3OBJS = \
CZBLAS1OBJS = \ CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)

View File

@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (alpha == ZERO) return; if (alpha == ZERO) return;
if (incx == 0 && incy == 0) {
*y += n * alpha *(*x);
return;
}
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
@ -83,17 +88,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (incy < 0) y -= (n - 1) * incy; if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) //
nthreads = 1;
//Temporarily work-around the low performance issue with small imput size & //Temporarily work-around the low performance issue with small imput size &
//multithreads. //multithreads.
if (n <= MULTI_THREAD_MINIMAL) if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m; if (trans) lenx = m;
if (trans) leny = n; if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return; if (alpha == ZERO) return;

View File

@ -44,6 +44,7 @@
#endif #endif
#ifndef COMPLEX #ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "QGEMM " #define ERROR_NAME "QGEMM "
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -52,6 +53,7 @@
#define ERROR_NAME "SGEMM " #define ERROR_NAME "SGEMM "
#endif #endif
#else #else
#define SMP_THRESHOLD_MIN 8192.0
#ifndef GEMM3M #ifndef GEMM3M
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "XGEMM " #define ERROR_NAME "XGEMM "
@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *sa, *sb; FLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *sa, *sb; XFLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -273,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#ifndef COMPLEX #ifndef COMPLEX
args.alpha = (void *)&alpha; args.alpha = (void *)&alpha;
args.beta = (void *)&beta; args.beta = (void *)&beta;
@ -411,25 +417,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k; MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1; args.nthreads = 1;
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#endif
args.common = NULL;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else else
args.nthreads = nthreads_max; args.nthreads = num_cpu_avail(3);
args.common = NULL;
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

View File

@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m; if (trans) lenx = m;
if (trans) leny = n; if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return; if (alpha == ZERO) return;

View File

@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
blas_level1_thread(mode, n, k1, k2, dummyalpha, blas_level1_thread(mode, n, k1, k2, dummyalpha,
a, lda, NULL, 0, ipiv, incx, a, lda, NULL, 0, ipiv, incx,
laswp[flag], nthreads); (int(*)())laswp[flag], nthreads);
} }
#endif #endif

View File

@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
mode = BLAS_SINGLE | BLAS_COMPLEX; mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif #endif
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
} }
#endif #endif

View File

@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double s; long double s;
long double r, roe, z; long double r, roe, z;
long double ada = fabs(da); long double ada = fabsl(da);
long double adb = fabs(db); long double adb = fabsl(db);
long double scale = ada + adb; long double scale = ada + adb;
#ifndef CBLAS #ifndef CBLAS

View File

@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
if (*dd2 == ZERO || dy1 == ZERO)
{
dflag = -TWO;
dparam[0] = dflag;
return;
}
if(*dd1 < ZERO) if(*dd1 < ZERO)
{ {
dflag = -ONE; dflag = -ONE;
@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
*dd2 = ZERO; *dd2 = ZERO;
*dx1 = ZERO; *dx1 = ZERO;
} }
else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO)
{
dflag = ONE;
dh12 = 1;
dh21 = -1;
*dx1 = dy1;
dtemp = *dd1;
*dd1 = *dd2;
*dd2 = dtemp;
}
else else
{ {
dp2 = *dd2 * dy1; dp2 = *dd2 * dy1;
@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq1 = dp1 * *dx1; dq1 = dp1 * *dx1;
if(ABS(dq1) > ABS(dq2)) if(ABS(dq1) > ABS(dq2))
{ {
dflag = ZERO;
dh11 = ONE;
dh22 = ONE;
dh21 = - dy1 / *dx1; dh21 = - dy1 / *dx1;
dh12 = dp2 / dp1; dh12 = dp2 / dp1;
@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
*dd1 = *dd1 / du; *dd1 = *dd1 / du;
*dd2 = *dd2 / du; *dd2 = *dd2 / du;
*dx1 = *dx1 * du; *dx1 = *dx1 * du;
} else {
dflag = -ONE;
dh11 = ZERO;
dh12 = ZERO;
dh21 = ZERO;
dh22 = ZERO;
*dd1 = ZERO;
*dd2 = ZERO;
*dx1 = ZERO;
} }
} }
else else
{ {
@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
} }
else else
{ {
dflag = ONE; dflag = ONE;
dh21 = -ONE;
dh12 = ONE;
dh11 = dp1 / dp2; dh11 = dp1 / dp2;
dh22 = *dx1 / dy1; dh22 = *dx1 / dy1;
@ -134,74 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
} }
if(*dd1 != ZERO) while ( *dd1 <= RGAMSQ && *dd1 != ZERO)
{ {
while( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) ) dflag = -ONE;
{ *dd1 = *dd1 * (GAM * GAM);
if(dflag == ZERO) *dx1 = *dx1 / GAM;
{ dh11 = dh11 / GAM;
dh11 = ONE; dh12 = dh12 / GAM;
dh22 = ONE; }
dflag = -ONE; while (ABS(*dd1) > GAMSQ) {
} dflag = -ONE;
else *dd1 = *dd1 / (GAM * GAM);
{ *dx1 = *dx1 * GAM;
if(dflag == ONE) dh11 = dh11 * GAM;
{ dh12 = dh12 * GAM;
dh21 = -ONE;
dh12 = ONE;
dflag = -ONE;
}
}
if( *dd1 <= RGAMSQ )
{
*dd1 = *dd1 * (GAM * GAM);
*dx1 = *dx1 / GAM;
dh11 = dh11 / GAM;
dh12 = dh12 / GAM;
}
else
{
*dd1 = *dd1 / (GAM * GAM);
*dx1 = *dx1 * GAM;
dh11 = dh11 * GAM;
dh12 = dh12 * GAM;
}
}
} }
if(*dd2 != ZERO) while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) {
{ dflag = -ONE;
while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) ) *dd2 = *dd2 * (GAM * GAM);
{ dh21 = dh21 / GAM;
if(dflag == ZERO) dh22 = dh22 / GAM;
{ }
dh11 = ONE; while (ABS(*dd2) > GAMSQ) {
dh22 = ONE; dflag = -ONE;
dflag = -ONE; *dd2 = *dd2 / (GAM * GAM);
} dh21 = dh21 * GAM;
else dh22 = dh22 * GAM;
{
if(dflag == ONE)
{
dh21 = -ONE;
dh12 = ONE;
dflag = -ONE;
}
}
if( ABS(*dd2) <= RGAMSQ )
{
*dd2 = *dd2 * (GAM * GAM);
dh21 = dh21 / GAM;
dh22 = dh22 / GAM;
}
else
{
*dd2 = *dd2 / (GAM * GAM);
dh21 = dh21 * GAM;
dh22 = dh22 * GAM;
}
}
} }
} }

View File

@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return; if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return; if (alpha == ZERO) return;

View File

@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
if (n <= 1048576 ) if (n <= 1048576 )
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return; if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return; if (alpha == ZERO) return;

View File

@ -42,7 +42,7 @@
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(THUNDERX2T99) || defined(VULCAN) #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
// Multithreaded swap gives performance benefits in ThunderX2T99 // Multithreaded swap gives performance benefits in ThunderX2T99
#else #else
// Disable multi-threading as it does not show any performance // Disable multi-threading as it does not show any performance

View File

@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (n == 0) return; if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return; if (alpha == ZERO) return;

View File

@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT);
args.nthreads = num_cpu_avail(3);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) { if (args.nthreads == 1) {

View File

@ -41,7 +41,11 @@
#ifdef FUNCTION_PROFILE #ifdef FUNCTION_PROFILE
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS #ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#endif #endif
#ifndef CBLAS #ifndef CBLAS
PRINT_DEBUG_CNAME; PRINT_DEBUG_NAME;
#else #else
PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;
#endif #endif
@ -78,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
if (incx == 0 && incy == 0) {
*y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) );
*(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) );
return;
}
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
@ -86,12 +96,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if (incy < 0) y -= (n - 1) * incy * 2; if (incy < 0) y -= (n - 1) * incy * 2;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) //
//Temporarily work around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m; if (trans & 1) lenx = m;
if (trans & 1) leny = n; if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return; if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m; if (trans & 1) lenx = m;
if (trans & 1) leny = n; if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return; if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return; if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -43,6 +43,10 @@
#include "functable.h" #include "functable.h"
#endif #endif
// this is smallest dimension N of square input a to permit threading
// see graph in issue #1820 for explanation
#define MULTI_THREAD_MINIMAL 362
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "XHEMV " #define ERROR_NAME "XHEMV "
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -181,7 +185,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
if (n == 0) return; if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
buffer = (FLOAT *)blas_memory_alloc(1); buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(2); if (n<MULTI_THREAD_MINIMAL) {
nthreads = 1 ;
} else {
nthreads = num_cpu_avail(2);
};
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return; if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double db_i = *(DB + 1); long double db_i = *(DB + 1);
long double r; long double r;
long double ada = fabs(da_r) + fabs(da_i); long double ada = fabsl(da_r) + fabsl(da_i);
PRINT_DEBUG_NAME; PRINT_DEBUG_NAME;

View File

@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (n == 0) return; if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
if ( n <= 1048576 ) if ( n <= 1048576 )
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -42,6 +42,14 @@
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
// Multithreaded swap gives performance benefits in ThunderX2T99
#else
// Disable multi-threading as it does not show any performance
// benefits. Keep the multi-threading code for the record.
#undef SMP
#endif
#ifndef CBLAS #ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@ -79,12 +87,12 @@ FLOAT *y = (FLOAT*)vy;
if (incy < 0) y -= (n - 1) * incy * 2; if (incy < 0) y -= (n - 1) * incy * 2;
#ifdef SMP #ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
if (incx == 0 || incy == 0) if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
nthreads = 1; nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else } else
nthreads = 1; nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) { if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40; buffer_size = n > 16 ? 0 : n * 4 + 40;
} }

View File

@ -121,14 +121,17 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3 # Makefile.L3
set(USE_TRMM false) set(USE_TRMM false)
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
set(USE_TRMM true) set(USE_TRMM true)
endif () endif ()
foreach (float_type ${FLOAT_TYPES}) foreach (float_type SINGLE DOUBLE)
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
endforeach()
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_char}GEMMINCOPY) if (${float_char}GEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
endif () endif ()

View File

@ -5,8 +5,43 @@ endif
TOPDIR = .. TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
AVX2OPT =
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
AVX2OPT = -mavx2
endif
endif
ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
AVX2OPT = -mavx2
endif
endif
ifdef NO_AVX2
AVX2OPT=
endif
ifdef TARGET_CORE ifdef TARGET_CORE
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(TARGET_CORE), SKYLAKEX)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
endif
else ifeq ($(TARGET_CORE), HASWELL)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif
BUILD_KERNEL = 1 BUILD_KERNEL = 1
KDIR = KDIR =
TSUFFIX = _$(TARGET_CORE) TSUFFIX = _$(TARGET_CORE)
@ -88,7 +123,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
$(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F)
setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
ifeq ($(USE_GEMM3M), 1)
$(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@
else
$(CC) -c $(CFLAGS) $< -o $@ $(CC) -c $(CFLAGS) $< -o $@
endif
setparam$(TSUFFIX).c : setparam-ref.c setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F) sed 's/TS/$(TSUFFIX)/g' $< > $(@F)

View File

@ -29,9 +29,11 @@ USE_TRMM = 1
endif endif
ifeq ($(CORE), HASWELL) ifeq ($(CORE), HASWELL)
ifeq ($(ARCH), x86_64)
USE_TRMM = 1 USE_TRMM = 1
endif endif
ifeq ($(CORE), SKYLAKEX)
USE_TRMM = 1
endif endif
ifeq ($(CORE), ZEN) ifeq ($(CORE), ZEN)
@ -42,7 +44,7 @@ ifeq ($(CORE), POWER8)
USE_TRMM = 1 USE_TRMM = 1
endif endif
ifeq ($(CORE), Z13) ifeq ($(ARCH), zarch)
USE_TRMM = 1 USE_TRMM = 1
endif endif

View File

@ -49,6 +49,7 @@ SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c
DSDOTKERNEL = ../generic/dot.c
SNRM2KERNEL = ../arm/nrm2.c SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c

View File

@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 } vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
fldmiad X!, { d6 - d7 } vldmia.f64 X!, { d6 - d7 }
vabs.f64 d6, d6 vabs.f64 d6, d6
vadd.f64 d1 , d1, d5 vadd.f64 d1 , d1, d5
vabs.f64 d7, d7 vabs.f64 d7, d7
@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmiad X!, { d4 } vldmia.f64 X!, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S4 .macro KERNEL_S4
fldmiad X, { d4 } vldmia.f64 X, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 } vldmia.f64 X, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 } vldmia.f64 X, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 } vldmia.f64 X, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
add X, X, INC_X add X, X, INC_X
@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmiad X, { d4 } vldmia.f64 X, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
add X, X, INC_X add X, X, INC_X
@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
fldmias X!, { s6 - s7 } vldmia.f32 X!, { s6 - s7 }
vabs.f32 s6, s6 vabs.f32 s6, s6
vadd.f32 s1 , s1, s5 vadd.f32 s1 , s1, s5
vabs.f32 s7, s7 vabs.f32 s7, s7
@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmias X!, { s4 } vldmia.f32 X!, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S4 .macro KERNEL_S4
fldmias X, { s4 } vldmia.f32 X, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 } vldmia.f32 X, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 } vldmia.f32 X, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 } vldmia.f32 X, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
add X, X, INC_X add X, X, INC_X
@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmias X, { s4 } vldmia.f32 X, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
add X, X, INC_X add X, X, INC_X
@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 } vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
fldmiad X!, { d6 - d7 } vldmia.f64 X!, { d6 - d7 }
vabs.f64 d6, d6 vabs.f64 d6, d6
vadd.f64 d1 , d1, d5 vadd.f64 d1 , d1, d5
vabs.f64 d7, d7 vabs.f64 d7, d7
@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vadd.f64 d1 , d1, d7 vadd.f64 d1 , d1, d7
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 } vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
fldmiad X!, { d6 - d7 } vldmia.f64 X!, { d6 - d7 }
vabs.f64 d6, d6 vabs.f64 d6, d6
vadd.f64 d1 , d1, d5 vadd.f64 d1 , d1, d5
vabs.f64 d7, d7 vabs.f64 d7, d7
@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmiad X!, { d4 } vldmia.f64 X!, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
fldmiad X!, { d4 } vldmia.f64 X!, { d4 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S4 .macro KERNEL_S4
fldmiad X, { d4 -d5 } vldmia.f64 X, { d4 -d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
vadd.f64 d0 , d0, d5 vadd.f64 d0 , d0, d5
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 -d5 } vldmia.f64 X, { d4 -d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
vadd.f64 d0 , d0, d5 vadd.f64 d0 , d0, d5
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 -d5 } vldmia.f64 X, { d4 -d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
vadd.f64 d0 , d0, d5 vadd.f64 d0 , d0, d5
add X, X, INC_X add X, X, INC_X
fldmiad X, { d4 -d5 } vldmia.f64 X, { d4 -d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmiad X, { d4 -d5 } vldmia.f64 X, { d4 -d5 }
vabs.f64 d4, d4 vabs.f64 d4, d4
vadd.f64 d0 , d0, d4 vadd.f64 d0 , d0, d4
vabs.f64 d5, d5 vabs.f64 d5, d5
@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
fldmias X!, { s6 - s7 } vldmia.f32 X!, { s6 - s7 }
vabs.f32 s6, s6 vabs.f32 s6, s6
vadd.f32 s1 , s1, s5 vadd.f32 s1 , s1, s5
vabs.f32 s7, s7 vabs.f32 s7, s7
vadd.f32 s0 , s0, s6 vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7 vadd.f32 s1 , s1, s7
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
fldmias X!, { s6 - s7 } vldmia.f32 X!, { s6 - s7 }
vabs.f32 s6, s6 vabs.f32 s6, s6
vadd.f32 s1 , s1, s5 vadd.f32 s1 , s1, s5
vabs.f32 s7, s7 vabs.f32 s7, s7
@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmias X!, { s4 } vldmia.f32 X!, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
fldmias X!, { s4 } vldmia.f32 X!, { s4 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S4 .macro KERNEL_S4
fldmias X, { s4 -s5 } vldmia.f32 X, { s4 -s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
vadd.f32 s0 , s0, s5 vadd.f32 s0 , s0, s5
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 -s5 } vldmia.f32 X, { s4 -s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
vadd.f32 s0 , s0, s5 vadd.f32 s0 , s0, s5
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 -s5 } vldmia.f32 X, { s4 -s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
vadd.f32 s0 , s0, s5 vadd.f32 s0 , s0, s5
add X, X, INC_X add X, X, INC_X
fldmias X, { s4 -s5 } vldmia.f32 X, { s4 -s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5
@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmias X, { s4 -s5 } vldmia.f32 X, { s4 -s5 }
vabs.f32 s4, s4 vabs.f32 s4, s4
vadd.f32 s0 , s0, s4 vadd.f32 s0 , s0, s4
vabs.f32 s5, s5 vabs.f32 s5, s5

View File

@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 } vldmia.f64 X!, { d4 - d7 }
pld [ Y, #X_PRE ] pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 } vldmia.f64 Y , { d8 - d11 }
fmacd d8 , d0, d4 fmacd d8 , d0, d4
fstmiad Y!, { d8 } vstmia.f64 Y!, { d8 }
fmacd d9 , d0, d5 fmacd d9 , d0, d5
fstmiad Y!, { d9 } vstmia.f64 Y!, { d9 }
fmacd d10, d0, d6 fmacd d10, d0, d6
fstmiad Y!, { d10 } vstmia.f64 Y!, { d10 }
fmacd d11, d0, d7 fmacd d11, d0, d7
fstmiad Y!, { d11 } vstmia.f64 Y!, { d11 }
.endm .endm
@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmiad X!, { d4 } vldmia.f64 X!, { d4 }
fldmiad Y , { d8 } vldmia.f64 Y , { d8 }
fmacd d8 , d0, d4 fmacd d8 , d0, d4
fstmiad Y!, { d8 } vstmia.f64 Y!, { d8 }
.endm .endm
.macro KERNEL_S1 .macro KERNEL_S1
fldmiad X , { d4 } vldmia.f64 X , { d4 }
fldmiad Y , { d8 } vldmia.f64 Y , { d8 }
fmacd d8 , d0, d4 fmacd d8 , d0, d4
fstmiad Y , { d8 } vstmia.f64 Y , { d8 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
fldmias X!, { s4 - s7 } vldmia.f32 X!, { s4 - s7 }
fldmias Y , { s8 - s11 } vldmia.f32 Y , { s8 - s11 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fstmias Y!, { s8 } vstmia.f32 Y!, { s8 }
fmacs s9 , s0, s5 fmacs s9 , s0, s5
fstmias Y!, { s9 } vstmia.f32 Y!, { s9 }
fmacs s10, s0, s6 fmacs s10, s0, s6
fstmias Y!, { s10 } vstmia.f32 Y!, { s10 }
fmacs s11, s0, s7 fmacs s11, s0, s7
fstmias Y!, { s11 } vstmia.f32 Y!, { s11 }
.endm .endm
@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmias X!, { s4 } vldmia.f32 X!, { s4 }
fldmias Y , { s8 } vldmia.f32 Y , { s8 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fstmias Y!, { s8 } vstmia.f32 Y!, { s8 }
.endm .endm
.macro KERNEL_S1 .macro KERNEL_S1
fldmias X , { s4 } vldmia.f32 X , { s4 }
fldmias Y , { s8 } vldmia.f32 Y , { s8 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fstmias Y , { s8 } vstmia.f32 Y , { s8 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 } vldmia.f64 X!, { d4 - d7 }
pld [ Y, #X_PRE ] pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 } vldmia.f64 Y , { d8 - d11 }
FMAC_R1 d8 , d0, d4 FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5 FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5 FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4 FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 } vstmia.f64 Y!, { d8 }
fstmiad Y!, { d9 } vstmia.f64 Y!, { d9 }
FMAC_R1 d10, d0, d6 FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7 FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7 FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6 FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 } vstmia.f64 Y!, { d10 }
fstmiad Y!, { d11 } vstmia.f64 Y!, { d11 }
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 } vldmia.f64 X!, { d4 - d7 }
pld [ Y, #X_PRE ] pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 } vldmia.f64 Y , { d8 - d11 }
FMAC_R1 d8 , d0, d4 FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5 FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5 FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4 FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 } vstmia.f64 Y!, { d8 }
fstmiad Y!, { d9 } vstmia.f64 Y!, { d9 }
FMAC_R1 d10, d0, d6 FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7 FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7 FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6 FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 } vstmia.f64 Y!, { d10 }
fstmiad Y!, { d11 } vstmia.f64 Y!, { d11 }
@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmiad X!, { d4 - d5 } vldmia.f64 X!, { d4 - d5 }
fldmiad Y , { d8 - d9 } vldmia.f64 Y , { d8 - d9 }
FMAC_R1 d8 , d0, d4 FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5 FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5 FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4 FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 } vstmia.f64 Y!, { d8 }
fstmiad Y!, { d9 } vstmia.f64 Y!, { d9 }
@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmiad X , { d4 - d5 } vldmia.f64 X , { d4 - d5 }
fldmiad Y , { d8 - d9 } vldmia.f64 Y , { d8 - d9 }
FMAC_R1 d8 , d0, d4 FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5 FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5 FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4 FMAC_I2 d9 , d1, d4
fstmiad Y , { d8 - d9 } vstmia.f64 Y , { d8 - d9 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4 .macro KERNEL_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmias X!, { s4 - s7 } vldmia.f32 X!, { s4 - s7 }
pld [ Y, #X_PRE ] pld [ Y, #X_PRE ]
fldmias Y , { s8 - s11 } vldmia.f32 Y , { s8 - s11 }
FMAC_R1 s8 , s0, s4 FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5 FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5 FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4 FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 } vstmia.f32 Y!, { s8 }
fstmias Y!, { s9 } vstmia.f32 Y!, { s9 }
FMAC_R1 s10, s0, s6 FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7 FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7 FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6 FMAC_I2 s11, s1, s6
fstmias Y!, { s10 } vstmia.f32 Y!, { s10 }
fstmias Y!, { s11 } vstmia.f32 Y!, { s11 }
fldmias X!, { s4 - s7 } vldmia.f32 X!, { s4 - s7 }
fldmias Y , { s8 - s11 } vldmia.f32 Y , { s8 - s11 }
FMAC_R1 s8 , s0, s4 FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5 FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5 FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4 FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 } vstmia.f32 Y!, { s8 }
fstmias Y!, { s9 } vstmia.f32 Y!, { s9 }
FMAC_R1 s10, s0, s6 FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7 FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7 FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6 FMAC_I2 s11, s1, s6
fstmias Y!, { s10 } vstmia.f32 Y!, { s10 }
fstmias Y!, { s11 } vstmia.f32 Y!, { s11 }
@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
fldmias Y , { s8 - s9 } vldmia.f32 Y , { s8 - s9 }
FMAC_R1 s8 , s0, s4 FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5 FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5 FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4 FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 } vstmia.f32 Y!, { s8 }
fstmias Y!, { s9 } vstmia.f32 Y!, { s9 }
@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmias X , { s4 - s5 } vldmia.f32 X , { s4 - s5 }
fldmias Y , { s8 - s9 } vldmia.f32 Y , { s8 - s9 }
FMAC_R1 s8 , s0, s4 FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5 FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5 FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4 FMAC_I2 s9 , s1, s4
fstmias Y , { s8 - s9 } vstmia.f32 Y , { s8 - s9 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble axpy_kernel_L999 ble axpy_kernel_L999
/*
cmp INC_X, #0 cmp INC_X, #0
beq axpy_kernel_L999 beq axpy_kernel_L999
cmp INC_Y, #0 cmp INC_Y, #0
beq axpy_kernel_L999 beq axpy_kernel_L999
*/
cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN bne axpy_kernel_S_BEGIN

View File

@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_F4 .macro COPY_F4
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
fldmias X!, { s0 - s7 } vldmia.f32 X!, { s0 - s7 }
fstmias Y!, { s0 - s7 } vstmia.f32 Y!, { s0 - s7 }
.endm .endm
.macro COPY_F1 .macro COPY_F1
fldmias X!, { s0 - s1 } vldmia.f32 X!, { s0 - s1 }
fstmias Y!, { s0 - s1 } vstmia.f32 Y!, { s0 - s1 }
.endm .endm
@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_S4 .macro COPY_S4
nop nop
fldmias X, { s0 - s1 } vldmia.f32 X, { s0 - s1 }
fstmias Y, { s0 - s1 } vstmia.f32 Y, { s0 - s1 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s2 - s3 } vldmia.f32 X, { s2 - s3 }
fstmias Y, { s2 - s3 } vstmia.f32 Y, { s2 - s3 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s0 - s1 } vldmia.f32 X, { s0 - s1 }
fstmias Y, { s0 - s1 } vstmia.f32 Y, { s0 - s1 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s2 - s3 } vldmia.f32 X, { s2 - s3 }
fstmias Y, { s2 - s3 } vstmia.f32 Y, { s2 - s3 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_S1 .macro COPY_S1
fldmias X, { s0 - s1 } vldmia.f32 X, { s0 - s1 }
fstmias Y, { s0 - s1 } vstmia.f32 Y, { s0 - s1 }
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y

View File

@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ] pld [ X, #X_PRE ]
pld [ Y, #X_PRE ] pld [ Y, #X_PRE ]
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
fldmias Y!, { s8 - s9 } vldmia.f32 Y!, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fldmias X!, { s6 - s7 } vldmia.f32 X!, { s6 - s7 }
fmacs s2 , s5, s9 fmacs s2 , s5, s9
fmacs s3 , s5, s8 fmacs s3 , s5, s8
fldmias Y!, { s10 - s11 } vldmia.f32 Y!, { s10 - s11 }
fmacs s0 , s6, s10 fmacs s0 , s6, s10
fmacs s1 , s6, s11 fmacs s1 , s6, s11
fmacs s2 , s7, s11 fmacs s2 , s7, s11
fmacs s3 , s7, s10 fmacs s3 , s7, s10
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
fldmias Y!, { s8 - s9 } vldmia.f32 Y!, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fldmias X!, { s6 - s7 } vldmia.f32 X!, { s6 - s7 }
fmacs s2 , s5, s9 fmacs s2 , s5, s9
fmacs s3 , s5, s8 fmacs s3 , s5, s8
fldmias Y!, { s10 - s11 } vldmia.f32 Y!, { s10 - s11 }
fmacs s0 , s6, s10 fmacs s0 , s6, s10
fmacs s1 , s6, s11 fmacs s1 , s6, s11
fmacs s2 , s7, s11 fmacs s2 , s7, s11
@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1 .macro KERNEL_F1
fldmias X!, { s4 - s5 } vldmia.f32 X!, { s4 - s5 }
fldmias Y!, { s8 - s9 } vldmia.f32 Y!, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
nop nop
fldmias X, { s4 - s5 } vldmia.f32 X, { s4 - s5 }
fldmias Y, { s8 - s9 } vldmia.f32 Y, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s4 - s5 } vldmia.f32 X, { s4 - s5 }
fldmias Y, { s8 - s9 } vldmia.f32 Y, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s4 - s5 } vldmia.f32 X, { s4 - s5 }
fldmias Y, { s8 - s9 } vldmia.f32 Y, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add X, X, INC_X add X, X, INC_X
add Y, Y, INC_Y add Y, Y, INC_Y
fldmias X, { s4 - s5 } vldmia.f32 X, { s4 - s5 }
fldmias Y, { s8 - s9 } vldmia.f32 Y, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1 .macro KERNEL_S1
fldmias X, { s4 - s5 } vldmia.f32 X, { s4 - s5 }
fldmias Y, { s8 - s9 } vldmia.f32 Y, { s8 - s9 }
fmacs s0 , s4, s8 fmacs s0 , s4, s8
fmacs s1 , s4, s9 fmacs s1 , s4, s9
fmacs s2 , s5, s9 fmacs s2 , s5, s9
@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble cdot_kernel_L999 ble cdot_kernel_L999
cmp INC_X, #0 # cmp INC_X, #0
beq cdot_kernel_L999 # beq cdot_kernel_L999
cmp INC_Y, #0 # cmp INC_Y, #0
beq cdot_kernel_L999 # beq cdot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne cdot_kernel_S_BEGIN bne cdot_kernel_S_BEGIN

View File

@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_I .macro KERNEL2x2_I
pld [ AO, #A_PRE ] pld [ AO, #A_PRE ]
fldmias AO!, { s0 - s3 } vldmia.f32 AO!, { s0 - s3 }
pld [ BO, #B_PRE ] pld [ BO, #B_PRE ]
fldmias BO!, { s4 - s7 } vldmia.f32 BO!, { s4 - s7 }
fmuls s8 , s0, s4 fmuls s8 , s0, s4
@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_M1 .macro KERNEL2x2_M1
pld [ AO, #A_PRE ] pld [ AO, #A_PRE ]
fldmias AO!, { s0 - s3 } vldmia.f32 AO!, { s0 - s3 }
pld [ BO, #B_PRE ] pld [ BO, #B_PRE ]
fldmias BO!, { s4 - s7 } vldmia.f32 BO!, { s4 - s7 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fmacs s9 , s0, s5 fmacs s9 , s0, s5
@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_M2 .macro KERNEL2x2_M2
fldmias AO!, { s0 - s3 } vldmia.f32 AO!, { s0 - s3 }
fldmias BO!, { s4 - s7 } vldmia.f32 BO!, { s4 - s7 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fmacs s9 , s0, s5 fmacs s9 , s0, s5
@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_E .macro KERNEL2x2_E
fldmias AO!, { s0 - s3 } vldmia.f32 AO!, { s0 - s3 }
fldmias BO!, { s4 - s7 } vldmia.f32 BO!, { s4 - s7 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fmacs s9 , s0, s5 fmacs s9 , s0, s5
@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_SUB .macro KERNEL2x2_SUB
fldmias AO!, { s0 - s3 } vldmia.f32 AO!, { s0 - s3 }
fldmias BO!, { s4 - s7 } vldmia.f32 BO!, { s4 - s7 }
fmacs s8 , s0, s4 fmacs s8 , s0, s4
fmacs s9 , s0, s5 fmacs s9 , s0, s5
@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s7 } vldmia.f32 CO1, { s4 - s7 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s11 FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10 FMAC_I2 s7 , s1 , s10
fstmias CO1, { s4 - s7 } vstmia.f32 CO1, { s4 - s7 }
fldmias CO2, { s4 - s7 } vldmia.f32 CO2, { s4 - s7 }
FMAC_R1 s4 , s0 , s12 FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13 FMAC_I1 s5 , s0 , s13
@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s15 FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14 FMAC_I2 s7 , s1 , s14
fstmias CO2, { s4 - s7 } vstmia.f32 CO2, { s4 - s7 }
add CO1, CO1, #16 add CO1, CO1, #16
@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s5 } vldmia.f32 CO1, { s4 - s5 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9 FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8 FMAC_I2 s5 , s1 , s8
fstmias CO1, { s4 - s5 } vstmia.f32 CO1, { s4 - s5 }
fldmias CO2, { s4 - s5 } vldmia.f32 CO2, { s4 - s5 }
FMAC_R1 s4 , s0 , s12 FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13 FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13 FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12 FMAC_I2 s5 , s1 , s12
fstmias CO2, { s4 - s5 } vstmia.f32 CO2, { s4 - s5 }
add CO1, CO1, #8 add CO1, CO1, #8
@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s7 } vldmia.f32 CO1, { s4 - s7 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s11 FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10 FMAC_I2 s7 , s1 , s10
fstmias CO1, { s4 - s7 } vstmia.f32 CO1, { s4 - s7 }
add CO1, CO1, #16 add CO1, CO1, #16
@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s5 } vldmia.f32 CO1, { s4 - s5 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9 FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8 FMAC_I2 s5 , s1 , s8
fstmias CO1, { s4 - s5 } vstmia.f32 CO1, { s4 - s5 }
add CO1, CO1, #8 add CO1, CO1, #8

View File

@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_I .macro KERNEL2x2_I
pld [ AO , #A_PRE ] pld [ AO , #A_PRE ]
pld [ BO , #B_PRE ] pld [ BO , #B_PRE ]
fldmias AO!, { s0 - s1 } vldmia.f32 AO!, { s0 - s1 }
fldmias BO!, { s8 - s9 } vldmia.f32 BO!, { s8 - s9 }
fmuls s16 , s0, s8 fmuls s16 , s0, s8
fmuls s24 , s1, s9 fmuls s24 , s1, s9
fldmias AO!, { s2 - s3 } vldmia.f32 AO!, { s2 - s3 }
fmuls s17 , s0, s9 fmuls s17 , s0, s9
fmuls s25 , s1, s8 fmuls s25 , s1, s8
fldmias BO!, { s10 - s11 } vldmia.f32 BO!, { s10 - s11 }
fmuls s18 , s2, s8 fmuls s18 , s2, s8
fmuls s26 , s3, s9 fmuls s26 , s3, s9
fldmias AO!, { s4 - s5 } vldmia.f32 AO!, { s4 - s5 }
fmuls s19 , s2, s9 fmuls s19 , s2, s9
fmuls s27 , s3, s8 fmuls s27 , s3, s8
fldmias BO!, { s12 - s13 } vldmia.f32 BO!, { s12 - s13 }
fmuls s20 , s0, s10 fmuls s20 , s0, s10
fmuls s28 , s1, s11 fmuls s28 , s1, s11
fldmias AO!, { s6 - s7 } vldmia.f32 AO!, { s6 - s7 }
fmuls s21 , s0, s11 fmuls s21 , s0, s11
fmuls s29 , s1, s10 fmuls s29 , s1, s10
fldmias BO!, { s14 - s15 } vldmia.f32 BO!, { s14 - s15 }
fmuls s22 , s2, s10 fmuls s22 , s2, s10
fmuls s30 , s3, s11 fmuls s30 , s3, s11
fmuls s23 , s2, s11 fmuls s23 , s2, s11
@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_M1 .macro KERNEL2x2_M1
fmacs s16 , s0, s8 fmacs s16 , s0, s8
fldmias AO!, { s4 - s5 } vldmia.f32 AO!, { s4 - s5 }
fmacs s24 , s1, s9 fmacs s24 , s1, s9
fmacs s17 , s0, s9 fmacs s17 , s0, s9
fldmias BO!, { s12 - s13 } vldmia.f32 BO!, { s12 - s13 }
fmacs s25 , s1, s8 fmacs s25 , s1, s8
fmacs s18 , s2, s8 fmacs s18 , s2, s8
fldmias AO!, { s6 - s7 } vldmia.f32 AO!, { s6 - s7 }
fmacs s26 , s3, s9 fmacs s26 , s3, s9
fmacs s19 , s2, s9 fmacs s19 , s2, s9
fldmias BO!, { s14 - s15 } vldmia.f32 BO!, { s14 - s15 }
fmacs s27 , s3, s8 fmacs s27 , s3, s8
fmacs s20 , s0, s10 fmacs s20 , s0, s10
@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ BO , #B_PRE ] pld [ BO , #B_PRE ]
fmacs s24 , s5, s13 fmacs s24 , s5, s13
fmacs s17 , s4, s13 fmacs s17 , s4, s13
fldmias AO!, { s0 - s1 } vldmia.f32 AO!, { s0 - s1 }
fmacs s25 , s5, s12 fmacs s25 , s5, s12
fmacs s18 , s6, s12 fmacs s18 , s6, s12
fmacs s26 , s7, s13 fmacs s26 , s7, s13
fldmias BO!, { s8 - s9 } vldmia.f32 BO!, { s8 - s9 }
fmacs s19 , s6, s13 fmacs s19 , s6, s13
fmacs s27 , s7, s12 fmacs s27 , s7, s12
fldmias AO!, { s2 - s3 } vldmia.f32 AO!, { s2 - s3 }
fmacs s20 , s4, s14 fmacs s20 , s4, s14
fmacs s28 , s5, s15 fmacs s28 , s5, s15
fldmias BO!, { s10 - s11 } vldmia.f32 BO!, { s10 - s11 }
fmacs s21 , s4, s15 fmacs s21 , s4, s15
fmacs s29 , s5, s14 fmacs s29 , s5, s14
@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_SUB .macro KERNEL2x2_SUB
fldmias AO!, { s0 - s1 } vldmia.f32 AO!, { s0 - s1 }
fldmias BO!, { s8 - s9 } vldmia.f32 BO!, { s8 - s9 }
fmacs s16 , s0, s8 fmacs s16 , s0, s8
fmacs s24 , s1, s9 fmacs s24 , s1, s9
fldmias AO!, { s2 - s3 } vldmia.f32 AO!, { s2 - s3 }
fmacs s17 , s0, s9 fmacs s17 , s0, s9
fmacs s25 , s1, s8 fmacs s25 , s1, s8
fldmias BO!, { s10 - s11 } vldmia.f32 BO!, { s10 - s11 }
fmacs s18 , s2, s8 fmacs s18 , s2, s8
fmacs s26 , s3, s9 fmacs s26 , s3, s9
fmacs s19 , s2, s9 fmacs s19 , s2, s9
@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s7 } vldmia.f32 CO1, { s4 - s7 }
fldmias CO2, { s8 - s11 } vldmia.f32 CO2, { s8 - s11 }
FADD_R s16, s24 , s16 FADD_R s16, s24 , s16
FADD_I s17, s25 , s17 FADD_I s17, s25 , s17
@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s10, s1 , s23 FMAC_R2 s10, s1 , s23
FMAC_I2 s11, s1 , s22 FMAC_I2 s11, s1 , s22
fstmias CO1, { s4 - s7 } vstmia.f32 CO1, { s4 - s7 }
fstmias CO2, { s8 - s11 } vstmia.f32 CO2, { s8 - s11 }
add CO1, CO1, #16 add CO1, CO1, #16
@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s5 } vldmia.f32 CO1, { s4 - s5 }
fldmias CO2, { s8 - s9 } vldmia.f32 CO2, { s8 - s9 }
FADD_R s16, s24 , s16 FADD_R s16, s24 , s16
FADD_I s17, s25 , s17 FADD_I s17, s25 , s17
@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s8 , s1 , s21 FMAC_R2 s8 , s1 , s21
FMAC_I2 s9 , s1 , s20 FMAC_I2 s9 , s1 , s20
fstmias CO1, { s4 - s5 } vstmia.f32 CO1, { s4 - s5 }
fstmias CO2, { s8 - s9 } vstmia.f32 CO2, { s8 - s9 }
add CO1, CO1, #8 add CO1, CO1, #8
@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s7 } vldmia.f32 CO1, { s4 - s7 }
FADD_R s16, s24 , s16 FADD_R s16, s24 , s16
FADD_I s17, s25 , s17 FADD_I s17, s25 , s17
@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s19 FMAC_R2 s6 , s1 , s19
FMAC_I2 s7 , s1 , s18 FMAC_I2 s7 , s1 , s18
fstmias CO1, { s4 - s7 } vstmia.f32 CO1, { s4 - s7 }
add CO1, CO1, #16 add CO1, CO1, #16
@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias CO1, { s4 - s5 } vldmia.f32 CO1, { s4 - s5 }
FADD_R s16, s24 , s16 FADD_R s16, s24 , s16
FADD_I s17, s25 , s17 FADD_I s17, s25 , s17
@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s4 , s1 , s17 FMAC_R2 s4 , s1 , s17
FMAC_I2 s5 , s1 , s16 FMAC_I2 s5 , s1 , s16
fstmias CO1, { s4 - s5 } vstmia.f32 CO1, { s4 - s5 }
add CO1, CO1, #8 add CO1, CO1, #8

View File

@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s6 , [ AO2, #8 ] flds s6 , [ AO2, #8 ]
flds s7 , [ AO2, #12 ] flds s7 , [ AO2, #12 ]
fstmias BO!, { s0 - s7 } vstmia.f32 BO!, { s0 - s7 }
add AO2, AO2, #16 add AO2, AO2, #16
.endm .endm
@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s3 , [ AO2, #4 ] flds s3 , [ AO2, #4 ]
add AO1, AO1, #8 add AO1, AO1, #8
fstmias BO!, { s0 - s3 } vstmia.f32 BO!, { s0 - s3 }
add AO2, AO2, #8 add AO2, AO2, #8
.endm .endm
@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s2 , [ AO1, #8 ] flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ] flds s3 , [ AO1, #12 ]
fstmias BO!, { s0 - s3 } vstmia.f32 BO!, { s0 - s3 }
add AO1, AO1, #16 add AO1, AO1, #16
.endm .endm
@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0 , [ AO1, #0 ] flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ] flds s1 , [ AO1, #4 ]
fstmias BO!, { s0 - s1 } vstmia.f32 BO!, { s0 - s1 }
add AO1, AO1, #8 add AO1, AO1, #8
.endm .endm

View File

@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
.macro COPY2x2 .macro COPY2x2
fldmias AO1, { s0 - s3 } vldmia.f32 AO1, { s0 - s3 }
add r3, AO1, LDA add r3, AO1, LDA
fldmias r3, { s4 - s7 } vldmia.f32 r3, { s4 - s7 }
fstmias BO1, { s0 - s7 } vstmia.f32 BO1, { s0 - s7 }
add AO1, AO1, #16 add AO1, AO1, #16
add BO1, BO1, M4 add BO1, BO1, M4
@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY1x2 .macro COPY1x2
fldmias AO1, { s0 -s1 } vldmia.f32 AO1, { s0 -s1 }
add r3, AO1, LDA add r3, AO1, LDA
fldmias r3, { s2 - s3 } vldmia.f32 r3, { s2 - s3 }
fstmias BO2, { s0 - s3 } vstmia.f32 BO2, { s0 - s3 }
add AO1, AO1, #8 add AO1, AO1, #8
add BO2, BO2, #16 add BO2, BO2, #16
@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*************************************************************************************************************************/ /*************************************************************************************************************************/
.macro COPY2x1 .macro COPY2x1
fldmias AO1, { s0 - s3 } vldmia.f32 AO1, { s0 - s3 }
fstmias BO1, { s0 - s3 } vstmia.f32 BO1, { s0 - s3 }
add AO1, AO1, #16 add AO1, AO1, #16
add BO1, BO1, M4 add BO1, BO1, M4
@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY1x1 .macro COPY1x1
fldmias AO1, { s0 - s1 } vldmia.f32 AO1, { s0 - s1 }
fstmias BO2, { s0 - s1 } vstmia.f32 BO2, { s0 - s1 }
add AO1, AO1, #8 add AO1, AO1, #8
add BO2, BO2, #8 add BO2, BO2, #8

View File

@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias YO, { s4 - s7 } vldmia.f32 YO, { s4 - s7 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s11 FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10 FMAC_I2 s7 , s1 , s10
fstmias YO!, { s4 - s7 } vstmia.f32 YO!, { s4 - s7 }
fldmias YO, { s4 - s7 } vldmia.f32 YO, { s4 - s7 }
FMAC_R1 s4 , s0 , s12 FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13 FMAC_I1 s5 , s0 , s13
@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
FMAC_R2 s6 , s1 , s15 FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14 FMAC_I2 s7 , s1 , s14
fstmias YO!, { s4 - s7 } vstmia.f32 YO!, { s4 - s7 }
.endm .endm
@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias YO, { s4 - s5 } vldmia.f32 YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9 FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8 FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 } vstmia.f32 YO, { s4 - s5 }
add YO, YO, #8 add YO, YO, #8
@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias YO, { s4 - s5 } vldmia.f32 YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9 FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8 FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 } vstmia.f32 YO, { s4 - s5 }
add YO, YO, INC_Y add YO, YO, INC_Y
fldmias YO, { s6 - s7 } vldmia.f32 YO, { s6 - s7 }
FMAC_R1 s6 , s0 , s10 FMAC_R1 s6 , s0 , s10
FMAC_I1 s7 , s0 , s11 FMAC_I1 s7 , s0 , s11
FMAC_R2 s6 , s1 , s11 FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10 FMAC_I2 s7 , s1 , s10
fstmias YO, { s6 - s7 } vstmia.f32 YO, { s6 - s7 }
add YO, YO, INC_Y add YO, YO, INC_Y
fldmias YO, { s4 - s5 } vldmia.f32 YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s12 FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13 FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13 FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12 FMAC_I2 s5 , s1 , s12
fstmias YO, { s4 - s5 } vstmia.f32 YO, { s4 - s5 }
add YO, YO, INC_Y add YO, YO, INC_Y
fldmias YO, { s6 - s7 } vldmia.f32 YO, { s6 - s7 }
FMAC_R1 s6 , s0 , s14 FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15 FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15 FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14 FMAC_I2 s7 , s1 , s14
fstmias YO, { s6 - s7 } vstmia.f32 YO, { s6 - s7 }
add YO, YO, INC_Y add YO, YO, INC_Y
@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R flds s0, ALPHA_R
flds s1, ALPHA_I flds s1, ALPHA_I
fldmias YO, { s4 - s5 } vldmia.f32 YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s8 FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9 FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9 FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8 FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 } vstmia.f32 YO, { s4 - s5 }
add YO, YO, INC_Y add YO, YO, INC_Y

Some files were not shown because too many files have changed in this diff Show More