Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop
This commit is contained in:
commit
628b335e83
|
@ -149,7 +149,7 @@ matrix:
|
|||
|
||||
- &test-macos
|
||||
os: osx
|
||||
osx_image: xcode8.3
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
|
@ -160,6 +160,7 @@ matrix:
|
|||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
|
|
|
@ -42,6 +42,19 @@ endif()
|
|||
|
||||
#######
|
||||
|
||||
if(MSVC AND MSVC_STATIC_CRT)
|
||||
set(CompilerFlags
|
||||
CMAKE_CXX_FLAGS
|
||||
CMAKE_CXX_FLAGS_DEBUG
|
||||
CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_C_FLAGS
|
||||
CMAKE_C_FLAGS_DEBUG
|
||||
CMAKE_C_FLAGS_RELEASE
|
||||
)
|
||||
foreach(CompilerFlag ${CompilerFlags})
|
||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
|
@ -62,10 +75,10 @@ endif ()
|
|||
|
||||
set(SUBDIRS ${BLASDIRS})
|
||||
if (NOT NO_LAPACK)
|
||||
list(APPEND SUBDIRS lapack)
|
||||
if(BUILD_RELAPACK)
|
||||
list(APPEND SUBDIRS relapack/src)
|
||||
endif()
|
||||
list(APPEND SUBDIRS lapack)
|
||||
endif ()
|
||||
|
||||
# set which float types we want to build for
|
||||
|
@ -134,7 +147,7 @@ endif ()
|
|||
|
||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||
if(MSVC)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||
endif()
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||
|
@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH})
|
|||
endforeach()
|
||||
endif ()
|
||||
|
||||
# Only build shared libs for MSVC
|
||||
if (MSVC)
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
endif()
|
||||
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
|
@ -166,7 +173,7 @@ endif()
|
|||
|
||||
# Handle MSVC exports
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||
else()
|
||||
# Creates verbose .def file (51KB vs 18KB)
|
||||
|
@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
|||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||
else()
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
|
@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
|||
if(NOT NOFORTRAN)
|
||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
|
||||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
|
||||
|
@ -327,10 +342,11 @@ endif()
|
|||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
|
|
2
Makefile
2
Makefile
|
@ -96,7 +96,7 @@ endif
|
|||
@echo
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED), 1)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
|
|
|
@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
|
|||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
|
|
|
@ -58,14 +58,14 @@ ifndef NO_LAPACKE
|
|||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
|
@ -106,14 +106,14 @@ ifndef NO_LAPACKE
|
|||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
|
@ -138,7 +138,7 @@ endif
|
|||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
|
|
@ -48,6 +48,8 @@ VERSION = 0.3.6.dev
|
|||
# HOSTCC = gcc
|
||||
|
||||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
|
||||
# Please note that AVX is not available on 32-bit.
|
||||
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
|
||||
# BINARY=64
|
||||
|
||||
# About threaded BLAS. It will be automatically detected if you don't
|
||||
|
@ -57,7 +59,7 @@ VERSION = 0.3.6.dev
|
|||
# USE_THREAD = 0
|
||||
|
||||
# If you're going to use this library with OpenMP, please comment it in.
|
||||
# This flag is always set for POWER8. Don't modify the flag
|
||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||
# USE_OPENMP = 1
|
||||
|
||||
# The OpenMP scheduler to use - by default this is "static" and you
|
||||
|
@ -68,36 +70,45 @@ VERSION = 0.3.6.dev
|
|||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||
|
||||
# You can define maximum number of threads. Basically it should be
|
||||
# less than actual number of cores. If you don't specify one, it's
|
||||
# automatically detected by the the script.
|
||||
# You can define the maximum number of threads. Basically it should be less
|
||||
# than or equal to the number of CPU threads. If you don't specify one, it's
|
||||
# automatically detected by the build system.
|
||||
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
|
||||
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
|
||||
# detection includes logical CPUs, thus allowing the use of SMT.
|
||||
# Users may opt at runtime to use less than NUM_THREADS threads.
|
||||
#
|
||||
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
|
||||
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
|
||||
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
|
||||
# footprint penalty, even if users reduce the actual number of threads at runtime.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# OpenBLAS's calculation API from multiple threads, please comment this in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can actually
|
||||
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# If you don't need to install the static library, please comment this in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
# if you don't need generate the shared library, please comment it in.
|
||||
# If you don't need to generate the shared library, please comment this in.
|
||||
# NO_SHARED = 1
|
||||
|
||||
# If you don't need CBLAS interface, please comment it in.
|
||||
# If you don't need the CBLAS interface, please comment this in.
|
||||
# NO_CBLAS = 1
|
||||
|
||||
# If you only want CBLAS interface without installing Fortran compiler,
|
||||
# please comment it in.
|
||||
# If you only want the CBLAS interface without installing a Fortran compiler,
|
||||
# please comment this in.
|
||||
# ONLY_CBLAS = 1
|
||||
|
||||
# If you don't need LAPACK, please comment it in.
|
||||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
|
||||
# If you don't need LAPACK, please comment this in.
|
||||
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
|
||||
# NO_LAPACK = 1
|
||||
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
|
@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# If you want to use the legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
|
@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
# are not sure(equivalent to "-i8" option).
|
||||
# compilers support this. It's safe to keep this commented out if you
|
||||
# are not sure. (This is equivalent to the "-i8" ifort option).
|
||||
# INTERFACE64 = 1
|
||||
|
||||
# Unfortunately most of kernel won't give us high quality buffer.
|
||||
|
@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# but it will consume time. If you don't like it, you can disable one.
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
|
||||
# This feature is only implemented on Linux, and is always disabled on other platforms.
|
||||
# Enabling affinity handling may improve performance, especially on NUMA systems, but
|
||||
# it may conflict with certain applications that also try to manage affinity.
|
||||
# This conflict can result in threads of the application calling OpenBLAS ending up locked
|
||||
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
|
||||
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
|
||||
# else modifies affinity settings.
|
||||
# Note: enabling affinity has been known to cause problems with NumPy and R
|
||||
NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
||||
|
@ -180,7 +199,7 @@ NO_AFFINITY = 1
|
|||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need santy check by comparing reference BLAS. It'll be very
|
||||
# If you need sanity check by comparing results to reference BLAS. It'll be very
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
|
|
|
@ -95,6 +95,9 @@ endif
|
|||
ifeq ($(TARGET), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), ARMV8)
|
||||
GETARCH_FLAGS := -DFORCE_ARMV7
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -152,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX
|
|||
endif
|
||||
|
||||
ifeq ($(BINARY), 32)
|
||||
GETARCH_FLAGS += -DNO_AVX
|
||||
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
|
||||
NO_AVX512 = 1
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX2), 1)
|
||||
|
|
|
@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
|||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
@ -91,7 +91,9 @@ CORTEXA73
|
|||
FALKOR
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
TSV110
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
Z13
|
||||
Z14
|
||||
|
|
|
@ -53,9 +53,9 @@ before_build:
|
|||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||
- cd build
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,29 +28,21 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
ev <- 0
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
ev <- eigen(A)
|
||||
})
|
||||
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,26 +28,13 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
B <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
A <- matrix(runif(n * n), nrow = n)
|
||||
B <- matrix(runif(n * n), nrow = n)
|
||||
C <- 1
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
|
@ -54,11 +42,10 @@ while (n <= nto) {
|
|||
l <- l + 1
|
||||
})
|
||||
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,31 +28,22 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
B <- matrix(rnorm(n * n), nrow = n)
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
solve(A, B)
|
||||
})
|
||||
|
||||
mflops <-
|
||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
45
c_check
45
c_check
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use File::Basename;
|
||||
use File::Temp qw(tempfile);
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
|
|||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
|
@ -31,12 +31,25 @@ if ($?) {
|
|||
|
||||
$cross_suffix = "";
|
||||
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
eval "use File::Basename";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
|
||||
if ($dirnam ne ".") {
|
||||
$cross_suffix .= $dirnam . "/";
|
||||
}
|
||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
} else {
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
}
|
||||
|
||||
$compiler = "";
|
||||
|
@ -171,6 +184,11 @@ if ($?) {
|
|||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
|
@ -185,6 +203,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
|
@ -204,9 +223,16 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
|
|||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
|
@ -215,6 +241,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
|||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
|
|
@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
|
|||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
|
|
|
@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
|
@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_LIST)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
|
||||
foreach(DCORE ${DYNAMIC_LIST})
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
|
||||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if (NO_LAPACK)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
|
||||
#Disable LAPACK C interface
|
||||
|
|
|
@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
|||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
||||
|
@ -78,7 +82,7 @@ endif()
|
|||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
|
|
2
common.h
2
common.h
|
@ -444,7 +444,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
#else
|
||||
|
|
|
@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
@ -598,9 +598,14 @@ REALNAME:;\
|
|||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.long .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR],5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
|
@ -611,9 +616,14 @@ _section_.text:;\
|
|||
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.llong .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR], 5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
|
|
|
@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
|
||||
|
||||
return result;
|
||||
#endif
|
||||
|
|
|
@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -39,6 +39,8 @@
|
|||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
@ -49,7 +51,8 @@ static char *cpuname[] = {
|
|||
"CORTEXA73",
|
||||
"FALKOR",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99"
|
||||
"THUNDERX2T99",
|
||||
"TSV110"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -61,7 +64,8 @@ static char *cpuname_lower[] = {
|
|||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99"
|
||||
"thunderx2t99",
|
||||
"tsv110"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -145,6 +149,9 @@ int detect(void)
|
|||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
@ -286,6 +293,21 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -228,7 +228,7 @@ int support_avx2(){
|
|||
}
|
||||
|
||||
int support_avx512(){
|
||||
#ifndef NO_AVX512
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
|
@ -1359,6 +1359,8 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
// Apollo Lake
|
||||
case 15:
|
||||
// Denverton
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
@ -1378,7 +1380,7 @@ int get_cpuname(void){
|
|||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 14: // Kaby Lake
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
|
|
|
@ -64,10 +64,8 @@ int detect(void)
|
|||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
@ -116,7 +114,14 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define L1_DATA_SIZE 131072\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8\n");
|
||||
printf("#define L2_SIZE 4194304\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
2
ctest.c
2
ctest.c
|
@ -113,7 +113,7 @@ ARCH_X86
|
|||
ARCH_X86_64
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
|
||||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
|
|
|
@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
|
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
range_m[num_cpu + 1] = range_m[num_cpu] + width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
|
|
|
@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
SetEvent(pool.killed);
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
#endif
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
CloseHandle(pool.filled);
|
||||
CloseHandle(pool.killed);
|
||||
|
||||
blas_server_avail = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -322,7 +322,7 @@ int support_avx2(){
|
|||
}
|
||||
|
||||
int support_avx512(){
|
||||
#ifndef NO_AVX512
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
|
@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake
|
||||
if (model == 12) {
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
|
|
|
@ -198,10 +198,18 @@ int get_num_procs(void);
|
|||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
|
@ -216,27 +224,42 @@ int i,n;
|
|||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -1290,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
|
|||
free(map_address);
|
||||
}
|
||||
|
||||
#ifdef SMP
|
||||
void blas_thread_memory_cleanup(void) {
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void blas_shutdown(void){
|
||||
#ifdef SMP
|
||||
BLASFUNC(blas_thread_shutdown)();
|
||||
|
@ -1299,7 +1329,7 @@ void blas_shutdown(void){
|
|||
/* Only cleanupIf we were built for threading and TLS was initialized */
|
||||
if (local_storage_key)
|
||||
#endif
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
base_address = 0UL;
|
||||
|
@ -1529,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
|||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP)
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
|
@ -1603,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#endif
|
||||
|
||||
#else
|
||||
/* USE_TLS / COMPILE_TLS not set */
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
#define ALLOC_WINDOWS
|
||||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
|
@ -1619,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#ifndef OS_WINDOWS
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
@ -1639,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
@ -1678,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||
#else
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
|
@ -1704,11 +1739,20 @@ void goto_set_num_threads(int num_threads) {};
|
|||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
|
@ -1723,26 +1767,42 @@ int i,n;
|
|||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -1793,7 +1853,7 @@ int get_num_procs(void) {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
|
@ -1870,7 +1930,7 @@ void openblas_fork_handler()
|
|||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
|
||||
// In the mean time build with USE_OPENMP=0 or link against another
|
||||
// implementation of OpenMP.
|
||||
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
int err;
|
||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
||||
if(err != 0)
|
||||
|
@ -1883,7 +1943,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
@ -1891,11 +1951,11 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
blas_goto_num = 0;
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
@ -1907,7 +1967,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
|
@ -1915,7 +1975,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
@ -2002,11 +2062,15 @@ static void *alloc_mmap(void *address){
|
|||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
|
@ -2148,14 +2212,18 @@ static void *alloc_mmap(void *address){
|
|||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
}
|
||||
#endif
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
}
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -2523,7 +2591,7 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos;
|
||||
int mypos = 0;
|
||||
#endif
|
||||
|
||||
void *map_address;
|
||||
|
@ -2554,6 +2622,11 @@ void *blas_memory_alloc(int procpos){
|
|||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory_initialized) {
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
|
@ -2589,6 +2662,9 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#if defined(USE_OPENMP)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
|
@ -2603,13 +2679,17 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
// blas_lock(&memory[position].lock);
|
||||
|
||||
#else
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
// blas_unlock(&memory[position].lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
position ++;
|
||||
|
@ -2621,21 +2701,26 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
position = 0;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
/* if (!memory[position].used) { */
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory[position].used) {
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
/* } */
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
goto error;
|
||||
|
||||
allocation :
|
||||
|
@ -2645,10 +2730,11 @@ void *blas_memory_alloc(int procpos){
|
|||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].addr) {
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
|
@ -2693,9 +2779,13 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
|
@ -2749,8 +2839,9 @@ void blas_memory_free(void *free_area){
|
|||
#endif
|
||||
|
||||
position = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
|
@ -2764,7 +2855,9 @@ void blas_memory_free(void *free_area){
|
|||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
|
@ -2779,8 +2872,9 @@ void blas_memory_free(void *free_area){
|
|||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,6 +141,14 @@ else
|
|||
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
|
||||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
|
@ -152,6 +160,7 @@ else
|
|||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
endif
|
||||
endif
|
||||
rm -f linktest
|
||||
|
||||
|
|
|
@ -40,15 +40,25 @@
|
|||
|
||||
void gotoblas_init(void);
|
||||
void gotoblas_quit(void);
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
void blas_thread_memory_cleanup(void);
|
||||
#endif
|
||||
|
||||
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
||||
|
||||
if (reason == DLL_PROCESS_ATTACH) {
|
||||
switch(reason) {
|
||||
case DLL_PROCESS_ATTACH:
|
||||
gotoblas_init();
|
||||
}
|
||||
|
||||
if (reason == DLL_PROCESS_DETACH) {
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
|
46
getarch.c
46
getarch.c
|
@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
#else
|
||||
#define NO_AVX512
|
||||
#endif
|
||||
/* #define FORCE_P2 */
|
||||
/* #define FORCE_KATMAI */
|
||||
/* #define FORCE_COPPERMINE */
|
||||
|
@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
|
@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
|
@ -1058,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_TSV110
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "TSV110"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTSV110 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "tsv110"
|
||||
#define CORENAME "TSV110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
@ -1078,6 +1114,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "Z13"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_Z14
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
#define SUBARCHITECTURE "Z14"
|
||||
#define ARCHCONFIG "-DZ14 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64"
|
||||
#define LIBNAME "z14"
|
||||
#define CORENAME "Z14"
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
|
|
|
@ -218,10 +218,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
#ifdef SMP
|
||||
/* nthreads = num_cpu_avail(2);
|
||||
|
||||
FIXME trmv_thread was found to be broken, see issue 1332 */
|
||||
nthreads = 1;
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
@ -81,6 +81,12 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define SMP_FACTOR 256
|
||||
#else
|
||||
#define SMP_FACTOR 128
|
||||
#endif
|
||||
|
||||
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
#ifndef TRMM
|
||||
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
|
||||
|
@ -366,10 +372,14 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||
|
||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
/*
|
||||
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
*/
|
||||
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
|
|
@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
} else
|
||||
nthreads = 1;
|
||||
|
||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
ifeq ($(CORE), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
|
|
@ -0,0 +1,175 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
|
|
@ -129,7 +129,7 @@ LL(12):
|
|||
STFD f0, 14 * SIZE(CO1)
|
||||
STFD f0, 15 * SIZE(CO1)
|
||||
|
||||
dcbst PRE, CO1
|
||||
dcbtst PRE, CO1
|
||||
addi CO1, CO1, 16 * SIZE
|
||||
bdnz LL(12)
|
||||
.align 4
|
||||
|
|
|
@ -134,7 +134,7 @@ LL(12):
|
|||
STFD f0, 14 * SIZE(CO1)
|
||||
STFD f0, 15 * SIZE(CO1)
|
||||
|
||||
dcbst PRE, CO1
|
||||
dcbtst PRE, CO1
|
||||
addi CO1, CO1, 16 * SIZE
|
||||
bdnz LL(12)
|
||||
.align 4
|
||||
|
|
|
@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
|
|
@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
|
|
@ -96,9 +96,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
|
|
@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
|
|
@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -106,9 +106,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
|
|
@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", // "0", "1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
|
@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
|
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
|
|
@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -209,11 +209,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
|
|
@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -60,9 +60,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -100,9 +100,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -66,9 +66,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -76,9 +76,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"movsd %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
@ -146,9 +146,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -79,9 +79,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
|
@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
|
|
|
@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
|
|
|
@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
|
@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
||||
"subq $8 , %1 \n\t"
|
||||
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
|
@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
|
|
@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|||
"movsd %%xmm11,8(%2) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
|
@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"movsd %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
|
@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
|
|
|
@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (n) // 0
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (alpha), // 3
|
||||
|
|
|
@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
|
|
@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
|
|
@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
|
|
@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
|
|
@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
|
|
@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
|
|
@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
|
|
@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
|
|
@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
|
|
@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
|
|
@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
|
|
@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" cmpq $0, %0 \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
|
||||
|
||||
" addq $8, %1 \n\t"
|
||||
|
@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" .p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
|
||||
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
|
||||
|
||||
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
|
||||
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
|
||||
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
|
||||
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
|
||||
|
@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
" jz 22f \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
|
||||
|
||||
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
|
||||
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
|
||||
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
|
||||
|
||||
|
@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
|
||||
|
||||
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
|
||||
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
|
||||
|
@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
|
||||
|
||||
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
|
||||
" vmovups 32(%9), %%ymm4 \n\t"
|
||||
" vmovups 32(%3), %%ymm4 \n\t"
|
||||
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
|
||||
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
|
||||
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
|
||||
|
@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
"5: \n\t" // i = 0
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
|
||||
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups %%ymm8 , (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups %%ymm8 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm8 , (%4) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
|
||||
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
|
||||
|
@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm9 , (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm9 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
|
||||
|
@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm10, (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm10, (%2) \n\t" // write a
|
||||
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
|
||||
|
@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm11, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm11, (%2) \n\t" // write a
|
||||
" vmovups %%ymm11, (%5) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
|
||||
|
@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm12, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm12, (%2) \n\t" // write a
|
||||
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
|
||||
|
@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm13, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm13, (%2) \n\t" // write a
|
||||
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
|
||||
|
@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm14, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm14, (%2) \n\t" // write a
|
||||
" vmovups %%ymm14, (%6) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
|
||||
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
|
||||
" vmovups %%ymm15, (%8) \n\t" // write a
|
||||
" vmovups %%ymm15, (%2) \n\t" // write a
|
||||
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
|
||||
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c3), // 5
|
||||
"r" (c6), // 6
|
||||
"r" (ldc), // 7
|
||||
"r" (as), // 8
|
||||
"r" (bs) // 9
|
||||
"r" (a), // 8
|
||||
"r" (b) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
|
|
@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" prefetcht0 384(%3,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" prefetcht0 384(%7,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
|
||||
"3: \n\t" // i = 1
|
||||
|
||||
" vmovddup (%7), %%xmm1 \n\t" // read b
|
||||
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
|
||||
" vmovddup (%3), %%xmm1 \n\t" // read b
|
||||
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
|
||||
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
|
||||
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
|
||||
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
|
||||
|
||||
" vmovups %%xmm12 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%6) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%6) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%6) \n\t" // write a
|
||||
" vmovups %%xmm12 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%2) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%2) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%2) \n\t" // write a
|
||||
|
||||
" vmovups %%xmm12 , (%5) \n\t" // write c1
|
||||
" vmovups %%xmm13 , 16(%5) \n\t"
|
||||
|
@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
|
||||
|
||||
" \n\t" // i = 0
|
||||
" subq $16 , %7 \n\t" // b = b - 2
|
||||
" subq $64 , %6 \n\t" // a = a - 8
|
||||
" subq $16 , %3 \n\t" // b = b - 2
|
||||
" subq $64 , %2 \n\t" // a = a - 8
|
||||
|
||||
" vmovddup (%7), %%xmm0 \n\t" // read bb
|
||||
" vmovddup (%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
|
||||
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
|
||||
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
|
||||
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%6) \n\t"
|
||||
" vmovups %%xmm10 , 32(%6) \n\t"
|
||||
" vmovups %%xmm11 , 48(%6) \n\t"
|
||||
" vmovups %%xmm8 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%2) \n\t"
|
||||
" vmovups %%xmm10 , 32(%2) \n\t"
|
||||
" vmovups %%xmm11 , 48(%2) \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%4) \n\t" // write c0
|
||||
" vmovups %%xmm9 , 16(%4) \n\t"
|
||||
|
@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
|
|
@ -135,7 +135,7 @@
|
|||
#endif
|
||||
|
||||
movq %rsp, %rbx # save old stack
|
||||
subq $128 + LOCAL_BUFFER_SIZE, %rsp
|
||||
subq $256 + LOCAL_BUFFER_SIZE, %rsp
|
||||
andq $-4096, %rsp # align stack
|
||||
|
||||
STACK_TOUCHING
|
||||
|
|
|
@ -383,7 +383,7 @@
|
|||
EMMS
|
||||
|
||||
movq %rsp, %rbx # save old stack
|
||||
subq $128 + LOCAL_BUFFER_SIZE, %rsp
|
||||
subq $256 + LOCAL_BUFFER_SIZE, %rsp
|
||||
andq $-4096, %rsp # align stack
|
||||
|
||||
STACK_TOUCHING
|
||||
|
|
|
@ -60,9 +60,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -79,9 +79,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
@ -140,9 +140,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -100,9 +100,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
|
|
@ -67,9 +67,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -80,9 +80,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -76,9 +76,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"movss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -83,9 +83,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -81,9 +81,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
@ -144,9 +144,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
|
|
@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
|
@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
|
||||
"3: \n\t"
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n1) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
|
@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (src), // 2
|
||||
"r" (dest) // 3
|
||||
: "cc",
|
||||
|
|
|
@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vbroadcastss (%2), %%xmm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%xmm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%xmm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%xmm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%xmm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%xmm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%xmm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%xmm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
||||
|
||||
|
@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"addq $4 , %8 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
|
||||
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
|
||||
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
|
@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
|
||||
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
|
||||
|
@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
|
||||
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%8,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
".align 2 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%8,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
|
||||
|
||||
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
|
||||
"addq $16, %8 \n\t"
|
||||
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
|
||||
"addq $16, %2 \n\t"
|
||||
"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
|
|
@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x8 1
|
||||
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
|
@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
|
@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"testq $0x08, %1 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
|
@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"addq $16, %8 \n\t"
|
||||
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
|
||||
"addq $16, %2 \n\t"
|
||||
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
|
||||
"subq $16, %1 \n\t"
|
||||
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
|
@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
}
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
|
@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
|
||||
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
|
|
|
@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movss (%2), %%xmm12 \n\t" // x0
|
||||
"movss 4(%2), %%xmm13 \n\t" // x1
|
||||
"movss 8(%2), %%xmm14 \n\t" // x2
|
||||
"movss 12(%2), %%xmm15 \n\t" // x3
|
||||
"movss (%3), %%xmm12 \n\t" // x0
|
||||
"movss 4(%3), %%xmm13 \n\t" // x1
|
||||
"movss 8(%3), %%xmm14 \n\t" // x2
|
||||
"movss 12(%3), %%xmm15 \n\t" // x3
|
||||
"shufps $0, %%xmm12, %%xmm12\n\t"
|
||||
"shufps $0, %%xmm13, %%xmm13\n\t"
|
||||
"shufps $0, %%xmm14, %%xmm14\n\t"
|
||||
"shufps $0, %%xmm15, %%xmm15\n\t"
|
||||
|
||||
"movss 16(%2), %%xmm0 \n\t" // x4
|
||||
"movss 20(%2), %%xmm1 \n\t" // x5
|
||||
"movss 24(%2), %%xmm2 \n\t" // x6
|
||||
"movss 28(%2), %%xmm3 \n\t" // x7
|
||||
"movss 16(%3), %%xmm0 \n\t" // x4
|
||||
"movss 20(%3), %%xmm1 \n\t" // x5
|
||||
"movss 24(%3), %%xmm2 \n\t" // x6
|
||||
"movss 28(%3), %%xmm3 \n\t" // x7
|
||||
"shufps $0, %%xmm0 , %%xmm0 \n\t"
|
||||
"shufps $0, %%xmm1 , %%xmm1 \n\t"
|
||||
"shufps $0, %%xmm2 , %%xmm2 \n\t"
|
||||
|
@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"1: \n\t"
|
||||
"xorps %%xmm4 , %%xmm4 \n\t"
|
||||
"xorps %%xmm5 , %%xmm5 \n\t"
|
||||
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
".p2align 1 \n\t"
|
||||
"movups (%4,%0,4), %%xmm8 \n\t"
|
||||
"movups (%5,%0,4), %%xmm9 \n\t"
|
||||
"movups (%6,%0,4), %%xmm10 \n\t"
|
||||
"movups (%7,%0,4), %%xmm11 \n\t"
|
||||
"movups (%5,%0,4), %%xmm8 \n\t"
|
||||
"movups (%6,%0,4), %%xmm9 \n\t"
|
||||
"movups (%7,%0,4), %%xmm10 \n\t"
|
||||
"movups (%8,%0,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm12, %%xmm8 \n\t"
|
||||
"mulps %%xmm13, %%xmm9 \n\t"
|
||||
|
@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"movups (%4,%8,4), %%xmm8 \n\t"
|
||||
"movups (%5,%8,4), %%xmm9 \n\t"
|
||||
"movups (%6,%8,4), %%xmm10 \n\t"
|
||||
"movups (%7,%8,4), %%xmm11 \n\t"
|
||||
"movups (%5,%2,4), %%xmm8 \n\t"
|
||||
"movups (%6,%2,4), %%xmm9 \n\t"
|
||||
"movups (%7,%2,4), %%xmm10 \n\t"
|
||||
"movups (%8,%2,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm0 , %%xmm8 \n\t"
|
||||
"mulps %%xmm1 , %%xmm9 \n\t"
|
||||
|
@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addps %%xmm5 , %%xmm4 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"mulps %%xmm6 , %%xmm4 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"addps %%xmm4 , %%xmm7 \n\t"
|
||||
|
||||
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
|
||||
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
|
|
@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
|
@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
"vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
"vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
"vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
|
||||
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
|
||||
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
|
||||
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
|
||||
"vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
|
||||
"vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
|
||||
"vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
|
||||
"vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
|
@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
|
||||
|
||||
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $4, %8 \n\t"
|
||||
"addq $4, %2 \n\t"
|
||||
"addq $4, %0 \n\t"
|
||||
"subq $4, %1 \n\t"
|
||||
|
||||
|
@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
|
||||
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
|
@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
|
||||
|
||||
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $8, %8 \n\t"
|
||||
"addq $8, %2 \n\t"
|
||||
"addq $8, %0 \n\t"
|
||||
"subq $8, %1 \n\t"
|
||||
|
||||
|
@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%6,%0,4) \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
"vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%8,%0,4) \n\t"
|
||||
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%8,4) \n\t"
|
||||
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%5,%8,4) \n\t"
|
||||
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
|
||||
"prefetcht0 192(%5,%2,4) \n\t"
|
||||
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%6,%2,4) \n\t"
|
||||
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%6,%8,4) \n\t"
|
||||
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
|
||||
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%7,%8,4) \n\t"
|
||||
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
|
||||
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"prefetcht0 192(%7,%2,4) \n\t"
|
||||
"vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
|
||||
"vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%8,%2,4) \n\t"
|
||||
"vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
|
||||
"vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
|
@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
|
||||
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
"vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
"vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
|
||||
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $16, %8 \n\t"
|
||||
"addq $16, %2 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
|
|
@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|||
"movss %%xmm11,4(%2) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
|
@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"movss %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
|
@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue