Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop

This commit is contained in:
AbdelRauf 2019-04-29 08:57:44 +00:00
commit 628b335e83
197 changed files with 17904 additions and 7444 deletions

View File

@ -149,7 +149,7 @@ matrix:
- &test-macos
os: osx
osx_image: xcode8.3
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
@ -160,6 +160,7 @@ matrix:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
osx_image: xcode8.3
env:
- BTYPE="BINARY=32"

View File

@ -42,6 +42,19 @@ endif()
#######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@ -62,10 +75,10 @@ endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src)
endif()
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
@ -134,7 +147,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH})
endforeach()
endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
# Android needs to explicitly link against libm
if(ANDROID)
@ -166,7 +173,7 @@ endif()
# Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@ -327,10 +342,11 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)

View File

@ -96,7 +96,7 @@ endif
@echo
shared :
ifndef NO_SHARED
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so

View File

@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

View File

@ -58,14 +58,14 @@ ifndef NO_LAPACKE
endif
#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@ -106,14 +106,14 @@ ifndef NO_LAPACKE
endif
#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@ -138,7 +138,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -48,6 +48,8 @@ VERSION = 0.3.6.dev
# HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64
# About threaded BLAS. It will be automatically detected if you don't
@ -57,7 +59,7 @@ VERSION = 0.3.6.dev
# USE_THREAD = 0
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you
@ -68,36 +70,45 @@ VERSION = 0.3.6.dev
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
# You can define the maximum number of threads. Basically it should be less
# than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the build system.
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use less than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can actually
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in.
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1
# if you don't need generate the shared library, please comment it in.
# If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in.
# If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# If you only want the CBLAS interface without installing a Fortran compiler,
# please comment this in.
# ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1
# USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
# are not sure(equivalent to "-i8" option).
# compilers support this. It's safe to keep this commented out if you
# are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1
# Unfortunately most of kernel won't give us high quality buffer.
@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1
# but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
# else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R
NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@ -180,7 +199,7 @@ NO_AFFINITY = 1
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very
# If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet).
# SANITY_CHECK = 1

View File

@ -95,6 +95,9 @@ endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif
@ -152,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif
ifeq ($(NO_AVX2), 1)

View File

@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -91,7 +91,9 @@ CORTEXA73
FALKOR
THUNDERX
THUNDERX2T99
TSV110
9.System Z:
ZARCH_GENERIC
Z13
Z14

View File

@ -53,9 +53,9 @@ before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script:
- cmake --build .

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
A <- matrix(runif(n * n), nrow = n)
B <- matrix(runif(n * n), nrow = n)
C <- 1
z <- system.time(for (l in 1:loops) {
@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1
})
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), nrow = n)
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

85
c_check
View File

@ -1,7 +1,7 @@
#!/usr/bin/perl
use File::Basename;
use File::Temp qw(tempfile);
#use File::Basename;
# use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 );
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@ -31,12 +31,25 @@ if ($?) {
$cross_suffix = "";
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
}
$compiler = "";
@ -171,20 +184,26 @@ if ($?) {
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$have_msa = 1;
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
}
unlink("tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

View File

@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)

View File

@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif ()
if (DEFINED TARGET)
@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
endif ()
endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()
if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface

View File

@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
@ -78,7 +82,7 @@ endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()

View File

@ -444,7 +444,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else

View File

@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -598,9 +598,14 @@ REALNAME:;\
#ifndef __64BIT__
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\
@ -611,9 +616,14 @@ _section_.text:;\
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\

View File

@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
return result;
#endif

View File

@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result;
}

View File

@ -39,6 +39,8 @@
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
static char *cpuname[] = {
"UNKNOWN",
@ -49,7 +51,8 @@ static char *cpuname[] = {
"CORTEXA73",
"FALKOR",
"THUNDERX",
"THUNDERX2T99"
"THUNDERX2T99",
"TSV110"
};
static char *cpuname_lower[] = {
@ -61,7 +64,8 @@ static char *cpuname_lower[] = {
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99"
"thunderx2t99",
"tsv110"
};
int get_feature(char *search)
@ -145,6 +149,9 @@ int detect(void)
return CPU_THUNDERX;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
}
p = (char *) NULL ;
@ -286,6 +293,21 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}

View File

@ -228,7 +228,7 @@ int support_avx2(){
}
int support_avx512(){
#ifndef NO_AVX512
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
@ -1359,6 +1359,8 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM;
}
break;
@ -1378,7 +1380,7 @@ int get_cpuname(void){
case 9:
case 8:
switch (model) {
case 14: // Kaby Lake
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())

View File

@ -27,9 +27,9 @@
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
@ -64,10 +64,8 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
return CPU_GENERIC;
}
@ -116,7 +114,14 @@ void get_cpuconfig(void)
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@ -113,7 +113,7 @@ ARCH_X86
ARCH_X86_64
#endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER
#endif

View File

@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;

View File

@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}

View File

@ -322,7 +322,7 @@ int support_avx2(){
}
int support_avx512(){
#ifndef NO_AVX512
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;

View File

@ -198,45 +198,68 @@ int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@ -1290,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
free(map_address);
}
#ifdef SMP
void blas_thread_memory_cleanup(void) {
blas_memory_cleanup((void*)get_memory_table());
}
#endif
void blas_shutdown(void){
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
@ -1299,7 +1329,7 @@ void blas_shutdown(void){
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)
#endif
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();
#ifdef SEEK_ADDRESS
base_address = 0UL;
@ -1529,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
break;
case DLL_THREAD_DETACH:
#if defined(SMP)
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();
#endif
break;
case DLL_PROCESS_DETACH:
@ -1603,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) {
#endif
#else
/* USE_TLS / COMPILE_TLS not set */
#include <errno.h>
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
@ -1619,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <stdio.h>
#include <fcntl.h>
#ifndef OS_WINDOWS
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
@ -1639,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@ -1678,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) {
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@ -1704,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {};
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@ -1793,7 +1853,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@ -1870,7 +1930,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
@ -1883,7 +1943,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@ -1891,11 +1951,11 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
blas_goto_num = 0;
// blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@ -1907,7 +1967,7 @@ int blas_get_cpu_number(void){
#endif
blas_omp_num = 0;
// blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
@ -1915,7 +1975,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@ -2002,11 +2062,15 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef OS_LINUX
@ -2148,14 +2212,18 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@ -2523,7 +2591,7 @@ void *blas_memory_alloc(int procpos){
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
int mypos = 0;
#endif
void *map_address;
@ -2554,6 +2622,11 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
@ -2589,6 +2662,9 @@ void *blas_memory_alloc(int procpos){
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG
printf("Alloc Start ...\n");
@ -2603,13 +2679,17 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
// blas_lock(&memory[position].lock);
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
// blas_unlock(&memory[position].lock);
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
@ -2621,21 +2701,26 @@ void *blas_memory_alloc(int procpos){
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
/* if (!memory[position].used) { */
/* blas_lock(&memory[position].lock);*/
#if defined(USE_OPENMP)
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
/* blas_unlock(&memory[position].lock);*/
/* } */
#if defined(USE_OPENMP)
blas_unlock(&memory[position].lock);
}
#endif
position ++;
} while (position < NUM_BUFFERS);
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
allocation :
@ -2645,10 +2730,11 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
#ifdef DEBUG
@ -2693,9 +2779,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@ -2749,8 +2839,9 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
@ -2764,7 +2855,9 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -2779,8 +2872,9 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}

View File

@ -141,6 +141,14 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
@ -152,6 +160,7 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif
rm -f linktest

View File

@ -40,15 +40,25 @@
void gotoblas_init(void);
void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
if (reason == DLL_PROCESS_ATTACH) {
gotoblas_init();
}
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
switch(reason) {
case DLL_PROCESS_ATTACH:
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
}
return TRUE;

View File

@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#endif
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#define NO_AVX512
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#endif
#ifdef FORCE_ATOM
#define FORCE
@ -1058,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
@ -1078,6 +1114,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE
#ifdef USER_TARGET

View File

@ -218,10 +218,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
/* nthreads = num_cpu_avail(2);
FIXME trmv_thread was found to be broken, see issue 1332 */
nthreads = 1;
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif

View File

@ -81,6 +81,12 @@
#endif
#endif
#ifndef COMPLEX
#define SMP_FACTOR 256
#else
#define SMP_FACTOR 128
#endif
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef TRMM
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
/*
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
*/
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);

View File

@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else
nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}

View File

@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
ifeq ($(TARGET), GENERIC)
ifeq ($(CORE), GENERIC)
USE_TRMM = 1
endif
@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif
ifeq ($(CORE), Z14)
USE_TRMM = 1
endif

View File

@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];

175
kernel/arm64/KERNEL.TSV110 Normal file
View File

@ -0,0 +1,175 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];

View File

@ -129,7 +129,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4

View File

@ -134,7 +134,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4

View File

@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@ -96,9 +96,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -106,9 +106,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@ -176,9 +176,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", // "0", "1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -209,11 +209,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -60,9 +60,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -100,9 +100,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@ -141,9 +141,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -66,9 +66,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -76,9 +76,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movsd %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@ -146,9 +146,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -82,9 +82,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -79,9 +79,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4

View File

@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",

View File

@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
"jnz 1b \n\t"
@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4

View File

@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t"
:
"+r" (n) // 0
:
"r" (n), // 0
"r" (x), // 1
"r" (x1), // 2
"r" (alpha), // 3

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
"+r" (from) // 0
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
"+r" (from) // 0
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
"+r" (from) // 0
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
"+r" (from) // 0
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t"
@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
" vmovups (%9), %%ymm0 \n\t"
" vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
" vmovups 32(%9), %%ymm4 \n\t"
" vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0
" addq $64, %9 \n\t" // b=b+8
" addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups %%ymm8 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm9 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm10, (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm11, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm12, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm13, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm14, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
" addq $32, %8 \n\t" // a=a+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
" vmovups %%ymm15, (%8) \n\t" // write a
" vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
"r" (as), // 8
"r" (bs) // 9
"r" (a), // 8
"r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" prefetcht0 384(%3,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" prefetcht0 384(%7,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1
" vmovddup (%7), %%xmm1 \n\t" // read b
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm1 \n\t" // read b
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0
" subq $16 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 8
" subq $16 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 8
" vmovddup (%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@ -135,7 +135,7 @@
#endif
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCHING

View File

@ -383,7 +383,7 @@
EMMS
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCHING

View File

@ -60,9 +60,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -79,9 +79,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@ -140,9 +140,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -100,9 +100,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@ -67,9 +67,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -80,9 +80,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -76,9 +76,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movss %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -83,9 +83,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -81,9 +81,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@ -144,9 +144,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"3: \n\t"
:
"+r" (i), // 0
"+r" (n1) // 1
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",

View File

@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
"vbroadcastss (%3), %%xmm12 \n\t" // x0
"vbroadcastss 4(%3), %%xmm13 \n\t" // x1
"vbroadcastss 8(%3), %%xmm14 \n\t" // x2
"vbroadcastss 12(%3), %%xmm15 \n\t" // x3
"vbroadcastss 16(%3), %%xmm0 \n\t" // x4
"vbroadcastss 20(%3), %%xmm1 \n\t" // x5
"vbroadcastss 24(%3), %%xmm2 \n\t" // x6
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
"2: \n\t"
@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"subq $8 , %1 \n\t"
@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
"addq $16, %2 \n\t"
"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"addq $16, %2 \n\t"
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
"jnz 1b \n\t"
@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"

View File

@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"movss (%3), %%xmm12 \n\t" // x0
"movss 4(%3), %%xmm13 \n\t" // x1
"movss 8(%3), %%xmm14 \n\t" // x2
"movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss 16(%2), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7
"movss 16(%3), %%xmm0 \n\t" // x4
"movss 20(%3), %%xmm1 \n\t" // x5
"movss 24(%3), %%xmm2 \n\t" // x6
"movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
".p2align 1 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
"movups (%5,%0,4), %%xmm8 \n\t"
"movups (%6,%0,4), %%xmm9 \n\t"
"movups (%7,%0,4), %%xmm10 \n\t"
"movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t"
"movups (%5,%2,4), %%xmm8 \n\t"
"movups (%6,%2,4), %%xmm9 \n\t"
"movups (%7,%2,4), %%xmm10 \n\t"
"movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
"vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4, %8 \n\t"
"addq $4, %2 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8, %8 \n\t"
"addq $8, %2 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
"addq $16, %8 \n\t"
"addq $16, %2 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4

Some files were not shown because too many files have changed in this diff Show More