Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop

This commit is contained in:
AbdelRauf 2019-04-29 08:57:44 +00:00
commit 628b335e83
197 changed files with 17904 additions and 7444 deletions

View File

@ -149,7 +149,7 @@ matrix:
- &test-macos - &test-macos
os: osx os: osx
osx_image: xcode8.3 osx_image: xcode10.1
before_script: before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update - brew update
@ -160,6 +160,7 @@ matrix:
- BTYPE="BINARY=64 INTERFACE64=1" - BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos - <<: *test-macos
osx_image: xcode8.3
env: env:
- BTYPE="BINARY=32" - BTYPE="BINARY=32"

View File

@ -42,6 +42,19 @@ endif()
####### #######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@ -62,10 +75,10 @@ endif ()
set(SUBDIRS ${BLASDIRS}) set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK) if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src) list(APPEND SUBDIRS relapack/src)
endif() endif()
list(APPEND SUBDIRS lapack)
endif () endif ()
# set which float types we want to build for # set which float types we want to build for
@ -134,7 +147,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release # Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC) if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif() endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH})
endforeach() endforeach()
endif () endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib # add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
# Android needs to explicitly link against libm # Android needs to explicitly link against libm
if(ANDROID) if(ANDROID)
@ -166,7 +173,7 @@ endif()
# Handle MSVC exports # Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS) if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else() else()
# Creates verbose .def file (51KB vs 18KB) # Creates verbose .def file (51KB vs 18KB)
@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION}
) )
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH) if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64") set(ARCH_IN "x86_64")
@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN) if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@ -327,10 +342,11 @@ endif()
if(NOT NO_CBLAS) if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif() endif()
if(NOT NO_LAPACKE) if(NOT NO_LAPACKE)

View File

@ -96,7 +96,7 @@ endif
@echo @echo
shared : shared :
ifndef NO_SHARED ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so

View File

@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif endif
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

View File

@ -58,14 +58,14 @@ ifndef NO_LAPACKE
endif endif
#for install static library #for install static library
ifndef NO_STATIC ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifndef NO_SHARED ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@ -106,14 +106,14 @@ ifndef NO_LAPACKE
endif endif
#for install static library #for install static library
ifndef NO_STATIC ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifndef NO_SHARED ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@ -138,7 +138,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED ifneq ($(NO_SHARED),1)
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -48,6 +48,8 @@ VERSION = 0.3.6.dev
# HOSTCC = gcc # HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64 # BINARY=64
# About threaded BLAS. It will be automatically detected if you don't # About threaded BLAS. It will be automatically detected if you don't
@ -57,7 +59,7 @@ VERSION = 0.3.6.dev
# USE_THREAD = 0 # USE_THREAD = 0
# If you're going to use this library with OpenMP, please comment it in. # If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1 # USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you # The OpenMP scheduler to use - by default this is "static" and you
@ -68,36 +70,45 @@ VERSION = 0.3.6.dev
# allow you to select the scheduler from the environment variable OMP_SCHEDULE # allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic # CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be # You can define the maximum number of threads. Basically it should be less
# less than actual number of cores. If you don't specify one, it's # than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the the script. # automatically detected by the build system.
# If SMT (a.k.a. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use fewer than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (e.g. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24 # NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call # If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in. # OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can # This flag defines how many instances of OpenBLAS's calculation API can actually
# actually run in parallel. If more threads call OpenBLAS's calculation API, # run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption. # they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2 # NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in. # If you don't need to install the static library, please comment this in.
# NO_STATIC = 1 # NO_STATIC = 1
# if you don't need generate the shared library, please comment it in. # If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1 # NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in. # If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1 # NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler, # If you only want the CBLAS interface without installing a Fortran compiler,
# please comment it in. # please comment this in.
# ONLY_CBLAS = 1 # ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in. # If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. # If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1 # NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. # If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1 # NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0 # Build LAPACK Deprecated functions since LAPACK 3.6.0
@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK # Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1 # BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation. # If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses # If you want to use the new, still somewhat experimental code that uses
@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1
# USE_TLS = 1 # USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran # If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you # compilers support this. It's safe to keep this commented out if you
# are not sure(equivalent to "-i8" option). # are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1 # INTERFACE64 = 1
# Unfortunately most of kernel won't give us high quality buffer. # Unfortunately most of kernel won't give us high quality buffer.
@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1
# but it will consume time. If you don't like it, you can disable one. # but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1 NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux. # Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. It can be safely enabled if
# nothing else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R.
NO_AFFINITY = 1 NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1 # BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@ -180,7 +199,7 @@ NO_AFFINITY = 1
# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very # If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).
# SANITY_CHECK = 1 # SANITY_CHECK = 1

View File

@ -95,6 +95,9 @@ endif
ifeq ($(TARGET), ZEN) ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA GETARCH_FLAGS := -DFORCE_BARCELONA
endif endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif endif
@ -152,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX
endif endif
ifeq ($(BINARY), 32) ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif endif
ifeq ($(NO_AVX2), 1) ifeq ($(NO_AVX2), 1)

View File

@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector
endif endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -91,7 +91,9 @@ CORTEXA73
FALKOR FALKOR
THUNDERX THUNDERX
THUNDERX2T99 THUNDERX2T99
TSV110
9.System Z: 9.System Z:
ZARCH_GENERIC ZARCH_GENERIC
Z13 Z13
Z14

View File

@ -53,9 +53,9 @@ before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build } - ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build - cd build
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script: build_script:
- cmake --build . - cmake --build .

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n) A <- matrix(rnorm(n * n), nrow = n)
ev <- 0 ev <- 0
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
ev <- eigen(A) ev <- eigen(A)
}) })
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(runif(n * n), A <- matrix(runif(n * n), nrow = n)
ncol = n, B <- matrix(runif(n * n), nrow = n)
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
C <- 1 C <- 1
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1 l <- l + 1
}) })
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n) A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n) B <- matrix(rnorm(n * n), nrow = n)
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
solve(A, B) solve(A, B)
}) })
mflops <- mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

45
c_check
View File

@ -1,7 +1,7 @@
#!/usr/bin/perl #!/usr/bin/perl
use File::Basename; #use File::Basename;
use File::Temp qw(tempfile); # use File::Temp qw(tempfile);
# Checking cross compile # Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x"); $hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 ); #$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"}; $binary = $ENV{"BINARY"};
$makefile = shift(@ARGV); $makefile = shift(@ARGV);
@ -31,12 +31,25 @@ if ($?) {
$cross_suffix = ""; $cross_suffix = "";
if (dirname($compiler_name) ne ".") { eval "use File::Basename";
$cross_suffix .= dirname($compiler_name) . "/"; if ($@){
} warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1; $cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} }
$compiler = ""; $compiler = "";
@ -171,6 +184,11 @@ if ($?) {
$have_msa = 0; $have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) { if (($architecture eq "mips") || ($architecture eq "mips64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"'; $code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n"; print $tmpf "#include <msa.h>\n\n";
@ -185,6 +203,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
$have_msa = 1; $have_msa = 1;
} }
unlink("$tmpf.o"); unlink("$tmpf.o");
}
} }
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
@ -204,9 +223,16 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0; $no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) { if (($architecture eq "x86") || ($architecture eq "x86_64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0; system(@cmd) == 0;
if ($? != 0) { if ($? != 0) {
@ -215,6 +241,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$no_avx512 = 0; $no_avx512 = 0;
} }
unlink("tmpf.o"); unlink("tmpf.o");
}
} }
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

View File

@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
if (NOT NO_AVX512) if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif () endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif () endif ()
if (NOT DYNAMIC_CORE) if (NOT DYNAMIC_CORE)

View File

@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA") set(TARGET "BARCELONA")
endif () endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif () endif ()
if (DEFINED TARGET) if (DEFINED TARGET)
@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
endif () endif ()
endif () endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()
if (NO_LAPACK) if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface #Disable LAPACK C interface

View File

@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1) set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1) set(X86_64 1)
else()
set(X86 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1) set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
@ -78,7 +82,7 @@ endif()
if (X86_64 OR X86) if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1) if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif() endif()

View File

@ -444,7 +444,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0 #define readenv(p, n) 0
#else #else
#ifdef OS_WINDOWS #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else #else

View File

@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH #define HAVE_PREFETCH
#endif #endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
#define DCBT_ARG 0 #define DCBT_ARG 0
#else #else
#define DCBT_ARG 8 #define DCBT_ARG 8
@ -598,9 +598,14 @@ REALNAME:;\
#ifndef __64BIT__ #ifndef __64BIT__
#define PROLOGUE \ #define PROLOGUE \
.machine "any";\ .machine "any";\
.toc;\
.globl .REALNAME;\ .globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\ .csect .text[PR],5;\
.REALNAME:; .REALNAME:
#define EPILOGUE \ #define EPILOGUE \
_section_.text:;\ _section_.text:;\
@ -611,9 +616,14 @@ _section_.text:;\
#define PROLOGUE \ #define PROLOGUE \
.machine "any";\ .machine "any";\
.toc;\
.globl .REALNAME;\ .globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\ .csect .text[PR], 5;\
.REALNAME:; .REALNAME:
#define EPILOGUE \ #define EPILOGUE \
_section_.text:;\ _section_.text:;\

View File

@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
return result; return result;
#endif #endif

View File

@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result; return result;
} }

View File

@ -39,6 +39,8 @@
// Cavium // Cavium
#define CPU_THUNDERX 7 #define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8 #define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
static char *cpuname[] = { static char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
@ -49,7 +51,8 @@ static char *cpuname[] = {
"CORTEXA73", "CORTEXA73",
"FALKOR", "FALKOR",
"THUNDERX", "THUNDERX",
"THUNDERX2T99" "THUNDERX2T99",
"TSV110"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
@ -61,7 +64,8 @@ static char *cpuname_lower[] = {
"cortexa73", "cortexa73",
"falkor", "falkor",
"thunderx", "thunderx",
"thunderx2t99" "thunderx2t99",
"tsv110"
}; };
int get_feature(char *search) int get_feature(char *search)
@ -145,6 +149,9 @@ int detect(void)
return CPU_THUNDERX; return CPU_THUNDERX;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99; return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
} }
p = (char *) NULL ; p = (char *) NULL ;
@ -286,6 +293,21 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n"); printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_TSV110:
    /*
     * HiSilicon TSV110 configuration: 64 KiB 4-way L1 instruction and data
     * caches and an 8-way L2, all with 64-byte lines.
     */
    printf("#define TSV110 \n");
    printf("#define L1_CODE_SIZE 65536 \n");
    printf("#define L1_CODE_LINESIZE 64 \n");
    printf("#define L1_CODE_ASSOCIATIVE 4 \n");
    printf("#define L1_DATA_SIZE 65536 \n");
    printf("#define L1_DATA_LINESIZE 64 \n");
    printf("#define L1_DATA_ASSOCIATIVE 4 \n");
    /* 512 KiB = 524288 bytes; this must agree with the -DL2_SIZE value
       used by the FORCE_TSV110 target configuration. */
    printf("#define L2_SIZE 524288 \n");
    printf("#define L2_LINESIZE 64 \n");
    printf("#define L2_ASSOCIATIVE 8 \n");
    printf("#define DTB_DEFAULT_ENTRIES 64 \n");
    printf("#define DTB_SIZE 4096 \n");
    break;
} }
} }

View File

@ -228,7 +228,7 @@ int support_avx2(){
} }
int support_avx512(){ int support_avx512(){
#ifndef NO_AVX512 #if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
int ret=0; int ret=0;
@ -1359,6 +1359,8 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
// Apollo Lake // Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
@ -1378,7 +1380,7 @@ int get_cpuname(void){
case 9: case 9:
case 8: case 8:
switch (model) { switch (model) {
case 14: // Kaby Lake case 14: // Kaby Lake and refreshes
if(support_avx2()) if(support_avx2())
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
if(support_avx()) if(support_avx())

View File

@ -64,10 +64,8 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
/* detect z14, but fall back to z13 */ if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC; return CPU_GENERIC;
} }
@ -116,7 +114,14 @@ void get_cpuconfig(void)
break; break;
case CPU_Z14: case CPU_Z14:
printf("#define Z14\n"); printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break; break;
} }
} }

View File

@ -113,7 +113,7 @@ ARCH_X86
ARCH_X86_64 ARCH_X86_64
#endif #endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER ARCH_POWER
#endif #endif

View File

@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m; if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m; if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;

View File

@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed); SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){ for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE); WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE #ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0); TerminateThread(blas_threads[i],0);
#endif #endif
CloseHandle(blas_threads[i]);
} }
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0; blas_server_avail = 0;
} }

View File

@ -322,7 +322,7 @@ int support_avx2(){
} }
int support_avx512(){ int support_avx512(){
#ifndef NO_AVX512 #if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
int ret=0; int ret=0;
@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
//Apollo Lake //Apollo Lake or Denverton
if (model == 12) { if (model == 12 || model == 15) {
return &gotoblas_NEHALEM; return &gotoblas_NEHALEM;
} }
return NULL; return NULL;

View File

@ -198,10 +198,18 @@ int get_num_procs(void);
#else #else
int get_num_procs(void) { int get_num_procs(void) {
static int nums = 0; static int nums = 0;
cpu_set_t *cpusetp; cpu_set_t cpuset,*cpusetp;
size_t size; size_t size;
int ret; int ret;
int i,n;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
/* n is assigned on the whole pre-2.7 affinity path, so it must exist for
   every glibc older than 2.7. */
int n;
#if !__GLIBC_PREREQ(2, 6)
/* i only drives the manual CPU_ISSET counting loop used before glibc 2.6. */
int i;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX) #if !defined(OS_LINUX)
@ -216,27 +224,42 @@ int i,n;
#endif #endif
#if !__GLIBC_PREREQ(2, 7) #if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums; if (ret!=0) return nums;
n=0; n=0;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++; if (CPU_ISSET(i,cpuset)) n++;
nums=n; nums=n;
#else #else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif #endif
return nums; return nums;
#else #else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums); cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums; if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums); size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp); ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums; if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp); ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret; if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
return nums; return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif #endif
#endif #endif
} }
@ -1290,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
free(map_address); free(map_address);
} }
#ifdef SMP
/*
 * Pass the table returned by get_memory_table() to blas_memory_cleanup().
 * Only built for SMP configurations.
 */
void blas_thread_memory_cleanup(void) {
    void *table = (void *)get_memory_table();

    blas_memory_cleanup(table);
}
#endif
void blas_shutdown(void){ void blas_shutdown(void){
#ifdef SMP #ifdef SMP
BLASFUNC(blas_thread_shutdown)(); BLASFUNC(blas_thread_shutdown)();
@ -1299,7 +1329,7 @@ void blas_shutdown(void){
/* Only cleanupIf we were built for threading and TLS was initialized */ /* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key) if (local_storage_key)
#endif #endif
blas_memory_cleanup((void*)get_memory_table()); blas_thread_memory_cleanup();
#ifdef SEEK_ADDRESS #ifdef SEEK_ADDRESS
base_address = 0UL; base_address = 0UL;
@ -1529,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
break; break;
case DLL_THREAD_DETACH: case DLL_THREAD_DETACH:
#if defined(SMP) #if defined(SMP)
blas_memory_cleanup((void*)get_memory_table()); blas_thread_memory_cleanup();
#endif #endif
break; break;
case DLL_PROCESS_DETACH: case DLL_PROCESS_DETACH:
@ -1603,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) {
#endif #endif
#else #else
/* USE_TLS / COMPILE_TLS not set */
#include <errno.h> #include <errno.h>
#ifdef OS_WINDOWS #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS #define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES #ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000 #define MEM_LARGE_PAGES 0x20000000
@ -1619,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <stdio.h> #include <stdio.h>
#include <fcntl.h> #include <fcntl.h>
#ifndef OS_WINDOWS #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#include <sys/mman.h> #include <sys/mman.h>
#ifndef NO_SYSV_IPC #ifndef NO_SYSV_IPC
#include <sys/shm.h> #include <sys/shm.h>
@ -1639,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h> #include <sys/resource.h>
#endif #endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN) #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/resource.h> #include <sys/resource.h>
#endif #endif
@ -1678,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) {
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor)) #define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor)) #define DESTRUCTOR __attribute__ ((destructor))
#else #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101))) #define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif #endif
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
@ -1704,11 +1739,20 @@ void goto_set_num_threads(int num_threads) {};
int get_num_procs(void); int get_num_procs(void);
#else #else
int get_num_procs(void) { int get_num_procs(void) {
static int nums = 0; static int nums = 0;
cpu_set_t *cpusetp; cpu_set_t cpuset,*cpusetp;
size_t size; size_t size;
int ret; int ret;
int i,n;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
/* n is assigned on the whole pre-2.7 affinity path, so it must exist for
   every glibc older than 2.7. */
int n;
#if !__GLIBC_PREREQ(2, 6)
/* i only drives the manual CPU_ISSET counting loop used before glibc 2.6. */
int i;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX) #if !defined(OS_LINUX)
@ -1723,26 +1767,42 @@ int i,n;
#endif #endif
#if !__GLIBC_PREREQ(2, 7) #if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums; if (ret!=0) return nums;
n=0; n=0;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++; if (CPU_ISSET(i,cpuset)) n++;
nums=n; nums=n;
#else #else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif #endif
return nums; return nums;
#else #else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums); cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums; if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums); size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp); ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums; if (ret!=0) {
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
return nums; return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif #endif
#endif #endif
} }
@ -1793,7 +1853,7 @@ int get_num_procs(void) {
#endif #endif
#if defined(OS_FREEBSD) #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) { int get_num_procs(void) {
@ -1870,7 +1930,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another // In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP. // implementation of OpenMP.
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err; int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0) if(err != 0)
@ -1883,7 +1943,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env(); extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){ int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num; int max_num;
#endif #endif
int blas_goto_num = 0; int blas_goto_num = 0;
@ -1891,11 +1951,11 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads; if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs(); max_num = get_num_procs();
#endif #endif
blas_goto_num = 0; // blas_goto_num = 0;
#ifndef USE_OPENMP #ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env(); blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0; if (blas_goto_num < 0) blas_goto_num = 0;
@ -1907,7 +1967,7 @@ int blas_get_cpu_number(void){
#endif #endif
blas_omp_num = 0; // blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env(); blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0; if (blas_omp_num < 0) blas_omp_num = 0;
@ -1915,7 +1975,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER; else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num; if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif #endif
@ -2002,11 +2062,15 @@ static void *alloc_mmap(void *address){
} }
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
} }
#ifdef OS_LINUX #ifdef OS_LINUX
@ -2148,14 +2212,18 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP) #if defined(OS_LINUX) && !defined(NO_WARMUP)
} }
#endif #endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
} #if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
}
return map_address; return map_address;
} }
@ -2523,7 +2591,7 @@ void *blas_memory_alloc(int procpos){
int position; int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP) #if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos; int mypos = 0;
#endif #endif
void *map_address; void *map_address;
@ -2554,6 +2622,11 @@ void *blas_memory_alloc(int procpos){
NULL, NULL,
}; };
void *(**func)(void *address); void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) { if (!memory_initialized) {
@ -2589,6 +2662,9 @@ void *blas_memory_alloc(int procpos){
} }
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Alloc Start ...\n"); printf("Alloc Start ...\n");
@ -2603,13 +2679,17 @@ void *blas_memory_alloc(int procpos){
do { do {
if (!memory[position].used && (memory[position].pos == mypos)) { if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
// blas_lock(&memory[position].lock); #else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation; if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
// blas_unlock(&memory[position].lock); #else
blas_unlock(&memory[position].lock);
#endif
} }
position ++; position ++;
@ -2621,21 +2701,26 @@ void *blas_memory_alloc(int procpos){
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif
do { do {
/* if (!memory[position].used) { */ #if defined(USE_OPENMP)
/* blas_lock(&memory[position].lock);*/ if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation; if (!memory[position].used) goto allocation;
/* blas_unlock(&memory[position].lock);*/ #if defined(USE_OPENMP)
/* } */ blas_unlock(&memory[position].lock);
}
#endif
position ++; position ++;
} while (position < NUM_BUFFERS); } while (position < NUM_BUFFERS);
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
goto error; goto error;
allocation : allocation :
@ -2645,10 +2730,11 @@ void *blas_memory_alloc(int procpos){
#endif #endif
memory[position].used = 1; memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/ #else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) { if (!memory[position].addr) {
do { do {
#ifdef DEBUG #ifdef DEBUG
@ -2693,9 +2779,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1); } while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address; memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@ -2749,8 +2839,9 @@ void blas_memory_free(void *free_area){
#endif #endif
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++; position++;
@ -2764,7 +2855,9 @@ void blas_memory_free(void *free_area){
WMB; WMB;
memory[position].used = 0; memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Unmap Succeeded.\n\n"); printf("Unmap Succeeded.\n\n");
@ -2779,8 +2872,9 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++) for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif #endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
return; return;
} }

View File

@ -141,6 +141,14 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif endif
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
ifneq ($(C_COMPILER), LSB) ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,--whole-archive $< -Wl,--no-whole-archive \
@ -152,6 +160,7 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) -Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif endif
rm -f linktest rm -f linktest

View File

@ -40,15 +40,25 @@
void gotoblas_init(void); void gotoblas_init(void);
void gotoblas_quit(void); void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
switch(reason) {
if (reason == DLL_PROCESS_ATTACH) { case DLL_PROCESS_ATTACH:
gotoblas_init(); gotoblas_init();
} break;
case DLL_PROCESS_DETACH:
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit(); gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
} }
return TRUE; return TRUE;

View File

@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h> #include <unistd.h>
#endif #endif
/*
 * Define NO_AVX512 unless the compiler is either GCC newer than 6 with
 * __AVX2__ defined, or clang with major version 6 or later.
 */
#if !(( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define NO_AVX512
#endif
/* #define FORCE_P2 */ /* #define FORCE_P2 */
/* #define FORCE_KATMAI */ /* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */ /* #define FORCE_COPPERMINE */
@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef FORCE_SKYLAKEX #ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "skylakex" #define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX" #define CORENAME "SKYLAKEX"
#endif #endif
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
@ -1058,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#endif #endif
/*
 * Forced target: TSV110 (ARM64 / ARMv8).
 * Defining FORCE_TSV110 sets FORCE and supplies fixed architecture names and
 * compiler defines (cache sizes, line sizes, associativity, TLB parameters,
 * VFP/NEON feature flags) through ARCHCONFIG.
 */
#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC #ifdef FORCE_ZARCH_GENERIC
#define FORCE #define FORCE
#define ARCHITECTURE "ZARCH" #define ARCHITECTURE "ZARCH"
@ -1078,6 +1114,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13" #define CORENAME "Z13"
#endif #endif
/*
 * Forced target: Z14 (ZARCH).
 * Defining FORCE_Z14 sets FORCE and the architecture/core names; ARCHCONFIG
 * passes only -DZ14 and the default TLB entry count (no cache geometry).
 */
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE #ifndef FORCE
#ifdef USER_TARGET #ifdef USER_TARGET

View File

@ -218,10 +218,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
buffer = (FLOAT *)blas_memory_alloc(1); buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP #ifdef SMP
/* nthreads = num_cpu_avail(2); nthreads = num_cpu_avail(2);
FIXME trmv_thread was found to be broken, see issue 1332 */
nthreads = 1;
if (nthreads == 1) { if (nthreads == 1) {
#endif #endif

View File

@ -81,6 +81,12 @@
#endif #endif
#endif #endif
/*
 * Multiplier applied to GEMM_MULTITHREAD_THRESHOLD when deciding whether an
 * m * n problem is large enough to run multithreaded: 128 for COMPLEX builds,
 * 256 otherwise.
 */
#ifdef COMPLEX
#define SMP_FACTOR 128
#else
#define SMP_FACTOR 256
#endif
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef TRMM #ifndef TRMM
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
@ -366,10 +372,14 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) /*
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
*/
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = 1; args.nthreads = 1;
else else
args.nthreads = num_cpu_avail(3); args.nthreads = num_cpu_avail(3);

View File

@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else } else
nthreads = 1; nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) { if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40; buffer_size = n > 16 ? 0 : n * 4 + 40;
} }

View File

@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1 USE_TRMM = 1
endif endif
ifeq ($(TARGET), GENERIC) ifeq ($(CORE), GENERIC)
USE_TRMM = 1 USE_TRMM = 1
endif endif
@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
USE_TRMM = 1 USE_TRMM = 1
endif endif
ifeq ($(CORE), Z14)
USE_TRMM = 1
endif

View File

@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n) while(i < n)
{ {
if( x[ix] > minf ) if( x[ix] < minf )
{ {
min = i; min = i;
minf = x[ix]; minf = x[ix];

175
kernel/arm64/KERNEL.TSV110 Normal file
View File

@ -0,0 +1,175 @@
# Kernel selection for TSV110 (kernel/arm64/KERNEL.TSV110).
# Every <X>KERNEL variable names the source file used for that routine.
# Files are either C sources under ../arm or ../generic, or assembly (.S)
# sources given without a directory prefix.
#
# --- min / abs-min / max reductions and their index variants: C sources ---
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
#
# --- TRSM kernels: the same generic C sources for all four precisions ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#
# --- abs-max reductions and their index variants: assembly ---
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
#
# --- remaining vector (level-1 style) kernels: assembly ---
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
#
# --- GEMV kernels (N = non-transposed, T = transposed): assembly ---
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
#
# --- GEMM / TRMM kernels and their packing (copy) routines ---
# Kernel file names are derived from the per-precision unroll factors
# (<P>GEMM_UNROLL_M / <P>GEMM_UNROLL_N), which are not set in this file.
# Each TRMM kernel is assigned exactly once, next to its GEMM kernel.
# The IN/IT copy routines are only defined when UNROLL_M differs from
# UNROLL_N; the ON/OT copy routines are always defined.
#
# Single precision real.
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Double precision real. Assembly copy routines are selected only for an
# M unroll of 8 and an N unroll of 4; other values use the generic C copies.
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Single precision complex (copy routines come from the generic zgemm_* sources).
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Double precision complex.
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n) while(i < n)
{ {
if( x[ix] > minf ) if( x[ix] < minf )
{ {
min = i; min = i;
minf = x[ix]; minf = x[ix];

View File

@ -129,7 +129,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1) STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1) STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1 dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE addi CO1, CO1, 16 * SIZE
bdnz LL(12) bdnz LL(12)
.align 4 .align 4

View File

@ -134,7 +134,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1) STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1) STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1 dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE addi CO1, CO1, 16 * SIZE
bdnz LL(12) bdnz LL(12)
.align 4 .align 4

View File

@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4

View File

@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4

View File

@ -96,9 +96,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4

View File

@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha), // 4 "r" (alpha), // 4

View File

@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -106,9 +106,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"0", "1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", // "0", "1", : "cc", // "0", "1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"0", "1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"0", "1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -209,11 +209,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"0", "1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"%0", "%1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n), // 0
"+r" (x) // 1
: :
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2 "r" (alpha) // 2
: "cc", //"0", "1", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -60,9 +60,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -100,9 +100,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -66,9 +66,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t" "vmovsd %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -76,9 +76,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movsd %%xmm4, (%4) \n\t" "movsd %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
@ -146,9 +146,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -79,9 +79,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (ap[0]), // 4 "r" (ap[0]), // 4
@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (ap), // 4 "r" (ap), // 4

View File

@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7 "r" (ap[3]), // 7
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm8", "%xmm9",

View File

@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__ __asm__ __volatile__
( (
"vzeroupper \n\t" "vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd (%3), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t" "testq $0x04, %1 \n\t"
"jz 2f \n\t" "jz 2f \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
"addq $4 , %8 \n\t" "addq $4 , %2 \n\t"
"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $8 , %2 \n\t" "addq $8 , %2 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "vmovupd %%ymm8,-64(%4,%0,8) \n\t" // 4 * y (y is operand %4 now that lda4 is operand %2; %3 is x)
"subq $8 , %1 \n\t" "subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
"jnz 1b \n\t" "jnz 1b \n\t"
@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
: :
"+r" (i), // 0 "+r" (i), // 0
"+r" (n) // 1 "+r" (n), // 1
"+r" (lda4) // 2
: :
"r" (x), // 2 "r" (x), // 3
"r" (y), // 3 "r" (y), // 4
"r" (ap[0]), // 4 "r" (ap[0]), // 5
"r" (ap[1]), // 5 "r" (ap[1]), // 6
"r" (ap[2]), // 6 "r" (ap[2]), // 7
"r" (ap[3]), // 7 "r" (ap[3]), // 8
"r" (lda4), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1",

View File

@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t" "movsd %%xmm11,8(%2) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2 "r" (y), // 2
"r" (ap0), // 3 "r" (ap0), // 3
"r" (ap1), // 4 "r" (ap1), // 4
@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t" "movsd %%xmm10, (%2) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2 "r" (y), // 2
"r" (ap), // 3 "r" (ap), // 3
"r" (x) // 4 "r" (x) // 4
@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2 "r" (&da), // 2
"r" (src), // 3 "r" (src), // 3
"r" (dest) // 4 "r" (dest) // 4

View File

@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (n) // 0
: :
"r" (n), // 0
"r" (x), // 1 "r" (x), // 1
"r" (x1), // 2 "r" (x1), // 2
"r" (alpha), // 3 "r" (alpha), // 3

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (n1), // 0
"+r" (x) // 1
: :
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2 "r" (alpha), // 2
"r" (n2) // 3 "r" (n2) // 3
: "cc", : "cc",

View File

@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
: :
"+r" (from) // 0
: :
"r" (from), // 0
"r" (to), // 1 "r" (to), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3

View File

@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (from) // 0
: :
"r" (from), // 0
"r" (to), // 1 "r" (to), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3

View File

@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2 "movsd %%xmm3 , 24(%9) \n\t" // save temp2
: :
"+r" (from) // 0
: :
"r" (from), // 0
"r" (to), // 1 "r" (to), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3

View File

@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (from) // 0
: :
"r" (from), // 0
"r" (to), // 1 "r" (to), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3

View File

@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (a0), // 4 "r" (a0), // 4

View File

@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (a0), // 4 "r" (a0), // 4

View File

@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2 "movsd %%xmm3 , 24(%9) \n\t" // save temp2
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (a0), // 4 "r" (a0), // 4

View File

@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (a0), // 4 "r" (a0), // 4

View File

@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t" " cmpq $0, %0 \n\t"
" je 4f \n\t" " je 4f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t" " addq $8, %1 \n\t"
@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t" " .p2align 4 \n\t"
"1: \n\t" "1: \n\t"
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t" " jz 22f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
" vmovups (%9), %%ymm0 \n\t" " vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
" vmovups 32(%9), %%ymm4 \n\t" " vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0 "5: \n\t" // i = 0
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t" " vmovups (%3), %%ymm0 \n\t"
" vmovups %%ymm8 , (%8) \n\t" // write a " vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c " vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t" " vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm9 , (%8) \n\t" // write a " vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t" " vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm10, (%8) \n\t" // write a " vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm11, (%8) \n\t" // write a " vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c " vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm12, (%8) \n\t" // write a " vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm13, (%8) \n\t" // write a " vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8 " addq $64, %3 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t" " vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm14, (%8) \n\t" // write a " vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c " vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
" addq $32, %8 \n\t" // a=a+8 " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
" vmovups %%ymm15, (%8) \n\t" // write a " vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t" " vzeroupper \n\t"
: :
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
: :
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4 "r" (c), // 4
"r" (c3), // 5 "r" (c3), // 5
"r" (c6), // 6 "r" (c6), // 6
"r" (ldc), // 7 "r" (ldc), // 7
"r" (as), // 8 "r" (a), // 8
"r" (bs) // 9 "r" (b) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t" " .align 16 \n\t"
"1: \n\t" "1: \n\t"
" prefetcht0 384(%2,%1,8) \n\t" " prefetcht0 384(%6,%1,8) \n\t"
" prefetcht0 384(%3,%1,8) \n\t" " prefetcht0 384(%7,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t" " vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t" " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t" " jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t" " prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t" " vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t" " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t" " jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t" " prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t" " vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t" " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t" " jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t" " prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups (%2,%1,8), %%xmm4 \n\t" " vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t" " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1 "3: \n\t" // i = 1
" vmovddup (%7), %%xmm1 \n\t" // read b " vmovddup (%3), %%xmm1 \n\t" // read b
" vmovddup 8(%7), %%xmm0 \n\t" // read bb " vmovddup 8(%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
" vmovups %%xmm12 , (%6) \n\t" // write a " vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a " vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a " vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t" " vmovups %%xmm13 , 16(%5) \n\t"
@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0 " \n\t" // i = 0
" subq $16 , %7 \n\t" // b = b - 2 " subq $16 , %3 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 8 " subq $64 , %2 \n\t" // a = a - 8
" vmovddup (%7), %%xmm0 \n\t" // read bb " vmovddup (%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
" vmovups %%xmm8 , (%6) \n\t" // write a " vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t" " vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t" " vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t" " vmovups %%xmm9 , 16(%4) \n\t"
@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t" " vzeroupper \n\t"
: :
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
: :
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4 "r" (c), // 4
"r" (c1), // 5 "r" (c1), // 5
"r" (as), // 6 "r" (a), // 6
"r" (bs) // 7 "r" (b) // 7
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@ -135,7 +135,7 @@
#endif #endif
movq %rsp, %rbx # save old stack movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack andq $-4096, %rsp # align stack
STACK_TOUCHING STACK_TOUCHING

View File

@ -383,7 +383,7 @@
EMMS EMMS
movq %rsp, %rbx # save old stack movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack andq $-4096, %rsp # align stack
STACK_TOUCHING STACK_TOUCHING

View File

@ -60,9 +60,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
@ -140,9 +140,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -100,9 +100,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4

View File

@ -67,9 +67,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t" "vmovss %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -80,9 +80,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -76,9 +76,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movss %%xmm4, (%4) \n\t" "movss %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -83,9 +83,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t" "vzeroupper \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -81,9 +81,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t" "vmovss %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
@ -144,9 +144,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t" "vmovss %%xmm4, (%4) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4

View File

@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (ap[0]), // 4 "r" (ap[0]), // 4
@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"3: \n\t" "3: \n\t"
: :
"+r" (i), // 0
"+r" (n1) // 1
: :
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (ap), // 4 "r" (ap), // 4
@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2 "r" (src), // 2
"r" (dest) // 3 "r" (dest) // 3
: "cc", : "cc",

View File

@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__ __asm__ __volatile__
( (
"vbroadcastss (%2), %%xmm12 \n\t" // x0 "vbroadcastss (%3), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1 "vbroadcastss 4(%3), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2 "vbroadcastss 8(%3), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3 "vbroadcastss 12(%3), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4 "vbroadcastss 16(%3), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5 "vbroadcastss 20(%3), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6 "vbroadcastss 24(%3), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7 "vbroadcastss 28(%3), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha "vbroadcastss (%9), %%xmm8 \n\t" // alpha
@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t" "addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
"2: \n\t" "2: \n\t"
@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"addq $8 , %8 \n\t" "addq $8 , %2 \n\t"
"subq $8 , %1 \n\t" "subq $8 , %1 \n\t"
@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t" "prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t" "prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t" "prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t" ".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t" "prefetcht0 192(%5,%2,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t" "prefetcht0 192(%6,%2,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t" "prefetcht0 192(%7,%2,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t" "prefetcht0 192(%8,%2,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t" "addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t" "addq $16, %2 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t" "subq $16, %1 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
: :
"+r" (i), // 0 "+r" (i), // 0
"+r" (n) // 1 "+r" (n), // 1
"+r" (lda4) // 2
: :
"r" (x), // 2 "r" (x), // 3
"r" (y), // 3 "r" (y), // 4
"r" (ap[0]), // 4 "r" (ap[0]), // 5
"r" (ap[1]), // 5 "r" (ap[1]), // 6
"r" (ap[2]), // 6 "r" (ap[2]), // 7
"r" (ap[3]), // 7 "r" (ap[3]), // 8
"r" (lda4), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1",

View File

@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#define HAVE_KERNEL_4x8 1 #define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__ __asm__ __volatile__
( (
"vzeroupper \n\t" "vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4 "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5 "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha "vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t" "testq $0x04, %1 \n\t"
"jz 2f \n\t" "jz 2f \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4 , %8 \n\t" "addq $4 , %2 \n\t"
"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t" "testq $0x08, %1 \n\t"
"jz 3f \n\t" "jz 3f \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8 , %8 \n\t" "addq $8 , %2 \n\t"
"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $8 , %1 \n\t" "subq $8 , %1 \n\t"
@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t" "addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $16, %8 \n\t" "addq $16, %2 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t" "subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
"jnz 1b \n\t" "jnz 1b \n\t"
@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
: :
"+r" (i), // 0 "+r" (i), // 0
"+r" (n) // 1 "+r" (n), // 1
"+r" (lda4) // 2
: :
"r" (x), // 2 "r" (x), // 3
"r" (y), // 3 "r" (y), // 4
"r" (ap[0]), // 4 "r" (ap[0]), // 5
"r" (ap[1]), // 5 "r" (ap[1]), // 6
"r" (ap[2]), // 6 "r" (ap[2]), // 7
"r" (ap[3]), // 7 "r" (ap[3]), // 8
"r" (lda4), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1",
@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
} }
#define HAVE_KERNEL_4x4 1 #define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha "vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t" "testq $0x04, %1 \n\t"
"jz 2f \n\t" "jz 2f \n\t"

View File

@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__ __asm__ __volatile__
( (
"movss (%2), %%xmm12 \n\t" // x0 "movss (%3), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1 "movss 4(%3), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2 "movss 8(%3), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3 "movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t" "shufps $0, %%xmm15, %%xmm15\n\t"
"movss 16(%2), %%xmm0 \n\t" // x4 "movss 16(%3), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5 "movss 20(%3), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6 "movss 24(%3), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7 "movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t"
@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t" "1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t" "xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
".p2align 1 \n\t" ".p2align 1 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t" "movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t" ".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm13, %%xmm9 \n\t"
@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t" "addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t" "addps %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,4), %%xmm8 \n\t" "movups (%5,%2,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t" "movups (%6,%2,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t" "movups (%7,%2,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t" "movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t" ".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm1 , %%xmm9 \n\t"
@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t" "addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t" "addps %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t" "addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t" "addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t" "mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t" "addps %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0 "+r" (i), // 0
"+r" (n) // 1 "+r" (n), // 1
"+r" (lda4) // 2
: :
"r" (x), // 2 "r" (x), // 3
"r" (y), // 3 "r" (y), // 4
"r" (ap[0]), // 4 "r" (ap[0]), // 5
"r" (ap[1]), // 5 "r" (ap[1]), // 6
"r" (ap[2]), // 6 "r" (ap[2]), // 7
"r" (ap[3]), // 7 "r" (ap[3]), // 8
"r" (lda4), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1",

View File

@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__ __asm__ __volatile__
( (
"vzeroupper \n\t" "vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4 "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5 "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha "vbroadcastss (%9), %%ymm6 \n\t" // alpha
@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4, %8 \n\t" "addq $4, %2 \n\t"
"addq $4, %0 \n\t" "addq $4, %0 \n\t"
"subq $4, %1 \n\t" "subq $4, %1 \n\t"
@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8, %8 \n\t" "addq $8, %2 \n\t"
"addq $8, %0 \n\t" "addq $8, %0 \n\t"
"subq $8, %1 \n\t" "subq $8, %1 \n\t"
@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t" "prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t" "prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t" "prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%4,%8,4) \n\t" "prefetcht0 192(%5,%2,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t" "prefetcht0 192(%6,%2,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t" "prefetcht0 192(%7,%2,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t" "prefetcht0 192(%8,%2,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
"addq $16, %8 \n\t" "addq $16, %2 \n\t"
"addq $16, %0 \n\t" "addq $16, %0 \n\t"
"subq $16, %1 \n\t" "subq $16, %1 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
: :
"+r" (i), // 0 "+r" (i), // 0
"+r" (n) // 1 "+r" (n), // 1
"+r" (lda4) // 2
: :
"r" (x), // 2 "r" (x), // 3
"r" (y), // 3 "r" (y), // 4
"r" (ap[0]), // 4 "r" (ap[0]), // 5
"r" (ap[1]), // 5 "r" (ap[1]), // 6
"r" (ap[2]), // 6 "r" (ap[2]), // 7
"r" (ap[3]), // 7 "r" (ap[3]), // 8
"r" (lda4), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1",

View File

@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t" "movss %%xmm11,4(%2) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2 "r" (y), // 2
"r" (ap0), // 3 "r" (ap0), // 3
"r" (ap1), // 4 "r" (ap1), // 4
@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t" "movss %%xmm10, (%2) \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2 "r" (y), // 2
"r" (ap), // 3 "r" (ap), // 3
"r" (x) // 4 "r" (x) // 4
@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t" "jnz 1b \n\t"
: :
"+r" (i), // 0
"+r" (n) // 1
: :
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2 "r" (&da), // 2
"r" (src), // 3 "r" (src), // 3
"r" (dest) // 4 "r" (dest) // 4

Some files were not shown because too many files have changed in this diff Show More