Merge pull request #2100 from xianyi/develop
Merge develop in preparation of 0.3.6 release
This commit is contained in:
commit
15cb124012
|
@ -149,7 +149,7 @@ matrix:
|
|||
|
||||
- &test-macos
|
||||
os: osx
|
||||
osx_image: xcode8
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
|
@ -160,6 +160,7 @@ matrix:
|
|||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 5)
|
||||
set(OpenBLAS_PATCH_VERSION 6)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
@ -42,6 +42,19 @@ endif()
|
|||
|
||||
#######
|
||||
|
||||
# When MSVC_STATIC_CRT is requested, switch the build to the static C runtime
# by rewriting the /MD switch to /MT in the compiler flag variables. Because
# "/MDd" contains "/MD", the debug variant is rewritten to "/MTd" as well.
# All four standard configurations are covered so that RelWithDebInfo and
# MinSizeRel builds do not silently keep the dynamic runtime.
if(MSVC AND MSVC_STATIC_CRT)
  set(CompilerFlags
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
  )
  foreach(CompilerFlag ${CompilerFlags})
    # ${CompilerFlag} is the variable's name; "${${CompilerFlag}}" is its value.
    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
  endforeach()
endif()
|
||||
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
|
@ -62,10 +75,10 @@ endif ()
|
|||
|
||||
set(SUBDIRS ${BLASDIRS})
|
||||
if (NOT NO_LAPACK)
|
||||
list(APPEND SUBDIRS lapack)
|
||||
if(BUILD_RELAPACK)
|
||||
list(APPEND SUBDIRS relapack/src)
|
||||
endif()
|
||||
list(APPEND SUBDIRS lapack)
|
||||
endif ()
|
||||
|
||||
# set which float types we want to build for
|
||||
|
@ -134,7 +147,7 @@ endif ()
|
|||
|
||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||
if(MSVC)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||
endif()
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||
|
@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH})
|
|||
endforeach()
|
||||
endif ()
|
||||
|
||||
# Only build shared libs for MSVC
|
||||
if (MSVC)
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
endif()
|
||||
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
|
@ -166,7 +173,7 @@ endif()
|
|||
|
||||
# Handle MSVC exports
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||
else()
|
||||
# Creates verbose .def file (51KB vs 18KB)
|
||||
|
@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
|||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
# A shared library that includes ReLAPACK is linked with the option that
# tolerates multiply defined symbols (presumably because ReLAPACK and LAPACK
# provide routines under the same names -- confirm against relapack/src).
# The option spelling depends on the linker: MSVC vs. GNU-style.
if (BUILD_RELAPACK AND BUILD_SHARED_LIBS)
  if (MSVC)
    target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
  else()
    target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
  endif()
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
|
@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
|||
if(NOT NOFORTRAN)
|
||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
|
||||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
|
||||
|
@ -327,10 +342,11 @@ endif()
|
|||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
|
|
|
@ -1,4 +1,82 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
29-Apr-2019
|
||||
|
||||
common:
|
||||
* the build tools now check that a given cpu TARGET is actually valid
|
||||
* the build-time check of system features (c_check) has been made
|
||||
less dependent on particular perl features (this should mainly
|
||||
benefit building on Windows)
|
||||
* several problems with the ReLAPACK integration were fixed,
|
||||
including INTERFACE64 support and building a shared library
|
||||
* building with CMAKE on BSD systems was improved
|
||||
* a non-absolute SUM function was added based on the
|
||||
existing optimized code for ASUM
|
||||
* CBLAS interfaces to the IxMIN and IxMAX functions were added
|
||||
* a name clash between LAPACKE and BOOST headers was resolved
|
||||
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
|
||||
kernels
|
||||
* a crash on thread (key) deletion with the USE_TLS=1 memory management
|
||||
option was fixed
|
||||
* restored several earlier fixes, in particular for OpenMP performance,
|
||||
building on BSD, and calling fork on CYGWIN, which had inadvertently
|
||||
been dropped in the 0.3.3 rewrite of the memory management code.
|
||||
|
||||
x86_64:
|
||||
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
|
||||
* building with old versions of MSVC was fixed
|
||||
* it is now possible to build a static library on Windows with CMAKE
|
||||
* accessing environment variables on CYGWIN at run time was fixed
|
||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
|
||||
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
|
||||
with CMAKE as well
|
||||
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
|
||||
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
|
||||
* assembly bugs involving undeclared modification of input operands were fixed
|
||||
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
|
||||
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
|
||||
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
|
||||
* a similar bug was fixed in the blas_quickdivide code used to split workloads
|
||||
in most functions
|
||||
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
|
||||
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
|
||||
environment does not support AVX512
|
||||
* improved GEMM performance on ZEN targets
|
||||
|
||||
x86:
|
||||
* build failures caused by the recently added checks for AVX512 were fixed
|
||||
* an inline assembly bug involving undeclared modification of an input argument was
|
||||
fixed in the blas_quickdivide code used to split workloads in most functions
|
||||
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
|
||||
|
||||
MIPS32:
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
|
||||
POWER:
|
||||
* single precision BLAS1/2 functions have received optimized POWER8 kernels
|
||||
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
|
||||
* building on PPC970 systems under OSX Leopard or Tiger is now supported
|
||||
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
|
||||
* building a shared library on AIX is now supported for POWER6
|
||||
* DYNAMIC_ARCH support has been added for POWER6 and newer
|
||||
|
||||
ARMv7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
|
||||
ARMv8:
|
||||
* added support for HiSilicon TSV110 cpus
|
||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||
* cross-compilation with CMAKE now works again
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
|
||||
|
||||
IBM Z:
|
||||
* optimized microkernels for single precision BLAS1/2 functions have been added
|
||||
for both Z13 and Z14
|
||||
|
||||
====================================================================
|
||||
Version 0.3.5
|
||||
31-Dec-2018
|
||||
|
|
2
Makefile
2
Makefile
|
@ -96,7 +96,7 @@ endif
|
|||
@echo
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED), 1)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
|
|
|
@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
|
|||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
|
|
|
@ -58,14 +58,14 @@ ifndef NO_LAPACKE
|
|||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
|
@ -106,14 +106,14 @@ ifndef NO_LAPACKE
|
|||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
|
@ -138,7 +138,7 @@ endif
|
|||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
|
|
@ -9,7 +9,15 @@ else
|
|||
USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.5
|
||||
VERSION = 0.3.6
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -48,6 +48,8 @@ VERSION = 0.3.5
|
|||
# HOSTCC = gcc
|
||||
|
||||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
|
||||
# Please note that AVX is not available on 32-bit.
|
||||
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
|
||||
# BINARY=64
|
||||
|
||||
# About threaded BLAS. It will be automatically detected if you don't
|
||||
|
@ -57,7 +59,7 @@ VERSION = 0.3.5
|
|||
# USE_THREAD = 0
|
||||
|
||||
# If you're going to use this library with OpenMP, please comment it in.
|
||||
# This flag is always set for POWER8. Don't modify the flag
|
||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||
# USE_OPENMP = 1
|
||||
|
||||
# The OpenMP scheduler to use - by default this is "static" and you
|
||||
|
@ -68,36 +70,45 @@ VERSION = 0.3.5
|
|||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||
|
||||
# You can define maximum number of threads. Basically it should be
|
||||
# less than actual number of cores. If you don't specify one, it's
|
||||
# automatically detected by the the script.
|
||||
# You can define the maximum number of threads. Basically it should be less
|
||||
# than or equal to the number of CPU threads. If you don't specify one, it's
|
||||
# automatically detected by the build system.
|
||||
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
|
||||
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
|
||||
# detection includes logical CPUs, thus allowing the use of SMT.
|
||||
# Users may opt at runtime to use less than NUM_THREADS threads.
|
||||
#
|
||||
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
|
||||
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
|
||||
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
|
||||
# footprint penalty, even if users reduce the actual number of threads at runtime.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# OpenBLAS's calculation API from multiple threads, please comment this in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can actually
|
||||
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# If you don't need to install the static library, please comment this in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
# if you don't need generate the shared library, please comment it in.
|
||||
# If you don't need to generate the shared library, please comment this in.
|
||||
# NO_SHARED = 1
|
||||
|
||||
# If you don't need CBLAS interface, please comment it in.
|
||||
# If you don't need the CBLAS interface, please comment this in.
|
||||
# NO_CBLAS = 1
|
||||
|
||||
# If you only want CBLAS interface without installing Fortran compiler,
|
||||
# please comment it in.
|
||||
# If you only want the CBLAS interface without installing a Fortran compiler,
|
||||
# please comment this in.
|
||||
# ONLY_CBLAS = 1
|
||||
|
||||
# If you don't need LAPACK, please comment it in.
|
||||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
|
||||
# If you don't need LAPACK, please comment this in.
|
||||
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
|
||||
# NO_LAPACK = 1
|
||||
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
|
@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# If you want to use the legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
|
@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
# are not sure(equivalent to "-i8" option).
|
||||
# compilers support this. It's safe to keep this commented out if you
|
||||
# are not sure. (This is equivalent to the "-i8" ifort option).
|
||||
# INTERFACE64 = 1
|
||||
|
||||
# Unfortunately most of kernel won't give us high quality buffer.
|
||||
|
@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# but it will consume time. If you don't like it, you can disable one.
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
|
||||
# This feature is only implemented on Linux, and is always disabled on other platforms.
|
||||
# Enabling affinity handling may improve performance, especially on NUMA systems, but
|
||||
# it may conflict with certain applications that also try to manage affinity.
|
||||
# This conflict can result in threads of the application calling OpenBLAS ending up locked
|
||||
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
|
||||
# For this reason, affinity handling is disabled by default. It can be safely enabled if nothing
|
||||
# else modifies affinity settings.
|
||||
# Note: enabling affinity has been known to cause problems with NumPy and R
|
||||
NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
||||
|
@ -180,7 +199,7 @@ NO_AFFINITY = 1
|
|||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need santy check by comparing reference BLAS. It'll be very
|
||||
# If you need a sanity check that compares results against the reference BLAS. It'll be very
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
|
|
|
@ -65,6 +65,7 @@ endif
|
|||
|
||||
ifdef TARGET
|
||||
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
||||
GETARCH_FLAGS += -DUSER_TARGET
|
||||
endif
|
||||
|
||||
# Force fallbacks for 32bit
|
||||
|
@ -94,6 +95,9 @@ endif
|
|||
ifeq ($(TARGET), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), ARMV8)
|
||||
GETARCH_FLAGS := -DFORCE_ARMV7
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -151,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX
|
|||
endif
|
||||
|
||||
ifeq ($(BINARY), 32)
|
||||
GETARCH_FLAGS += -DNO_AVX
|
||||
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
|
||||
NO_AVX512 = 1
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX2), 1)
|
||||
|
@ -523,6 +528,12 @@ DYNAMIC_CORE += THUNDERX
|
|||
DYNAMIC_CORE += THUNDERX2T99
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
override DYNAMIC_ARCH=
|
||||
|
|
|
@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
|||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
@ -48,6 +48,7 @@ POWER5
|
|||
POWER6
|
||||
POWER7
|
||||
POWER8
|
||||
POWER9
|
||||
PPCG4
|
||||
PPC970
|
||||
PPC970MP
|
||||
|
@ -90,7 +91,9 @@ CORTEXA73
|
|||
FALKOR
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
TSV110
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
Z13
|
||||
Z14
|
||||
|
|
|
@ -53,9 +53,9 @@ before_build:
|
|||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||
- cd build
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,29 +28,21 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
ev <- 0
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
ev <- eigen(A)
|
||||
})
|
||||
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,26 +28,13 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
B <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
A <- matrix(runif(n * n), nrow = n)
|
||||
B <- matrix(runif(n * n), nrow = n)
|
||||
C <- 1
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
|
@ -54,11 +42,10 @@ while (n <= nto) {
|
|||
l <- l + 1
|
||||
})
|
||||
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
|||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
|
@ -27,31 +28,22 @@ if (p != "") {
|
|||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
B <- matrix(rnorm(n * n), nrow = n)
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
solve(A, B)
|
||||
})
|
||||
|
||||
mflops <-
|
||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
|
85
c_check
85
c_check
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use File::Basename;
|
||||
use File::Temp qw(tempfile);
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
|
|||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
|
@ -31,12 +31,25 @@ if ($?) {
|
|||
|
||||
$cross_suffix = "";
|
||||
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
eval "use File::Basename";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||
# Emulate File::Basename::dirname: keep everything before the last "/".
# rindex() returns the position of that "/", which is exactly the length of
# the directory part, so it must not be decremented. A name that contains no
# "/" (rindex returns -1) has the directory ".".
my $slashpos = rindex($compiler_name, "/");
my $dirnam = ($slashpos < 0) ? "." : substr($compiler_name, 0, $slashpos);
|
||||
if ($dirnam ne ".") {
|
||||
$cross_suffix .= $dirnam . "/";
|
||||
}
|
||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
} else {
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
}
|
||||
|
||||
$compiler = "";
|
||||
|
@ -171,20 +184,26 @@ if ($?) {
|
|||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
|
@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
|
|||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
# Probe whether the compiler can assemble an AVX512 instruction.
# tempfile() in list context returns (filehandle, filename): the test source
# has to be written through the handle, while the filename is what is handed
# to the compiler.
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
# Close the handle so the buffered source is on disk before the compiler runs.
close($fh);
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd);
# A non-zero exit status means the compiler rejected the AVX512 test program.
if ($? != 0) {
    $no_avx512 = 1;
} else {
    $no_avx512 = 0;
}
# Remove the object file produced by the probe; it is named after the temp file.
unlink("$tmpf.o");
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
|
15
cblas.h
15
cblas.h
|
@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
|
|||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
||||
|
@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
|||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
|
|
@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
|
|||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
|
|
|
@ -107,6 +107,12 @@ macro(SetDefaultL1)
|
|||
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(SSUMKERNEL sum.S)
|
||||
set(DSUMKERNEL sum.S)
|
||||
set(CSUMKERNEL zsum.S)
|
||||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
|
@ -162,4 +168,4 @@ macro(SetDefaultL3)
|
|||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
endmacro ()
|
||||
endmacro ()
|
||||
|
|
|
@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
|||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
# FreeBSD, OpenBSD, NetBSD and DragonFly: disable the extended-precision
# build (NO_EXPRECISION) and add -lm to the extra link libraries.
if (${CMAKE_SYSTEM_NAME} MATCHES "(Free|Open|Net)BSD|DragonFly")
  set(NO_EXPRECISION 1)
  set(EXTRALIB "${EXTRALIB} -lm")
endif ()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
|
||||
set(EXTRALIB "${EXTRALIB} -lm")
|
||||
endif ()
|
||||
|
|
|
@ -87,13 +87,18 @@ endif ()
|
|||
# Cannot run getarch on target if we are cross-compiling
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||
# Write to config as getarch would
|
||||
# TCORE is the core name written to the generated config: start from CORE
# and let an explicitly defined TARGET_CORE take precedence.
set(TCORE ${CORE})
if (DEFINED TARGET_CORE)
  set(TCORE ${TARGET_CORE})
endif()
|
||||
|
||||
# TODO: Set up defines that getarch sets up based on every other target
|
||||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${CORE}\n"
|
||||
"#define CHAR_CORENAME \"${CORE}\"\n")
|
||||
if ("${CORE}" STREQUAL "ARMV7")
|
||||
"#define ${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
|
@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "ARMV8")
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
|
@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define DTB_SIZE\t4096\n"
|
||||
"#define L2_ASSOCIATIVE\t32\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53")
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73")
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t49152\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "FALKOR")
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "THUNDERX)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${CORE}" STREQUAL "THUNDERX2T99)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define L3_ASSOCIATIVE\t32\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define VULCAN\n")
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
|
@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
endif()
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
|
|
|
@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
|
@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
# When a DYNAMIC_LIST is given, pass -DDYNAMIC_LIST plus one -DDYN_<core>
# definition per listed core to the compiler via CCOMMON_OPT.
if (DYNAMIC_LIST)
  # Collect the flags first, then append them to CCOMMON_OPT in one step.
  set(_dynamic_list_defs "-DDYNAMIC_LIST")
  foreach(_dyn_core ${DYNAMIC_LIST})
    set(_dynamic_list_defs "${_dynamic_list_defs} -DDYN_${_dyn_core}")
  endforeach ()
  set(CCOMMON_OPT "${CCOMMON_OPT} ${_dynamic_list_defs}")
  unset(_dynamic_list_defs)
endif ()
|
||||
|
||||
if (NO_LAPACK)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
|
||||
#Disable LAPACK C interface
|
||||
|
|
|
@ -39,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
|||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
set(X86_64 1)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
||||
set(ARM 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
set(ARM64 1)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
|
@ -78,7 +86,7 @@ endif()
|
|||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
|
|
9
common.h
9
common.h
|
@ -85,6 +85,8 @@ extern "C" {
|
|||
|
||||
#if !defined(_MSC_VER)
|
||||
#include <unistd.h>
|
||||
#elif _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#include <time.h>
|
||||
|
||||
|
@ -348,6 +350,11 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/* POWER9: unless YIELDING was already defined, yield by executing a short
   run of eight nop instructions. */
#if defined(POWER9) && !defined(YIELDING)
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
|
@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
#else
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define CDOTC_K cdotc_k
|
||||
#define CNRM2_K cnrm2_k
|
||||
#define CSCAL_K cscal_k
|
||||
#define CSUM_K csum_k
|
||||
#define CSWAP_K cswap_k
|
||||
#define CROT_K csrot_k
|
||||
|
||||
|
@ -249,6 +250,7 @@
|
|||
#define CDOTC_K gotoblas -> cdotc_k
|
||||
#define CNRM2_K gotoblas -> cnrm2_k
|
||||
#define CSCAL_K gotoblas -> cscal_k
|
||||
#define CSUM_K gotoblas -> csum_k
|
||||
#define CSWAP_K gotoblas -> cswap_k
|
||||
#define CROT_K gotoblas -> csrot_k
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define DDOTC_K ddot_k
|
||||
#define DNRM2_K dnrm2_k
|
||||
#define DSCAL_K dscal_k
|
||||
#define DSUM_K dsum_k
|
||||
#define DSWAP_K dswap_k
|
||||
#define DROT_K drot_k
|
||||
|
||||
|
@ -174,6 +175,7 @@
|
|||
#define DDOTC_K gotoblas -> ddot_k
|
||||
#define DNRM2_K gotoblas -> dnrm2_k
|
||||
#define DSCAL_K gotoblas -> dscal_k
|
||||
#define DSUM_K gotoblas -> dsum_k
|
||||
#define DSWAP_K gotoblas -> dswap_k
|
||||
#define DROT_K gotoblas -> drot_k
|
||||
|
||||
|
|
|
@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
|
|||
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
|
||||
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
|
||||
double BLASFUNC(dsum) (blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
|
||||
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
||||
|
|
|
@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
|
|||
double zasum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float ssum_k (BLASLONG, float *, BLASLONG);
|
||||
double dsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
float csum_k (BLASLONG, float *, BLASLONG);
|
||||
double zsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float samax_k (BLASLONG, float *, BLASLONG);
|
||||
double damax_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
#define DOTC_K QDOTC_K
|
||||
#define NRM2_K QNRM2_K
|
||||
#define SCAL_K QSCAL_K
|
||||
#define SUM_K QSUM_K
|
||||
#define SWAP_K QSWAP_K
|
||||
#define ROT_K QROT_K
|
||||
|
||||
|
@ -356,6 +357,7 @@
|
|||
#define DOTC_K DDOTC_K
|
||||
#define NRM2_K DNRM2_K
|
||||
#define SCAL_K DSCAL_K
|
||||
#define SUM_K DSUM_K
|
||||
#define SWAP_K DSWAP_K
|
||||
#define ROT_K DROT_K
|
||||
|
||||
|
@ -658,6 +660,7 @@
|
|||
#define DOTC_K SDOTC_K
|
||||
#define NRM2_K SNRM2_K
|
||||
#define SCAL_K SSCAL_K
|
||||
#define SUM_K SSUM_K
|
||||
#define SWAP_K SSWAP_K
|
||||
#define ROT_K SROT_K
|
||||
|
||||
|
@ -962,6 +965,7 @@
|
|||
#define DOTC_K XDOTC_K
|
||||
#define NRM2_K XNRM2_K
|
||||
#define SCAL_K XSCAL_K
|
||||
#define SUM_K XSUM_K
|
||||
#define SWAP_K XSWAP_K
|
||||
#define ROT_K XROT_K
|
||||
|
||||
|
@ -1363,6 +1367,7 @@
|
|||
#define DOTC_K ZDOTC_K
|
||||
#define NRM2_K ZNRM2_K
|
||||
#define SCAL_K ZSCAL_K
|
||||
#define SUM_K ZSUM_K
|
||||
#define SWAP_K ZSWAP_K
|
||||
#define ROT_K ZROT_K
|
||||
|
||||
|
@ -1785,6 +1790,7 @@
|
|||
#define DOTC_K CDOTC_K
|
||||
#define NRM2_K CNRM2_K
|
||||
#define SCAL_K CSCAL_K
|
||||
#define SUM_K CSUM_K
|
||||
#define SWAP_K CSWAP_K
|
||||
#define ROT_K CROT_K
|
||||
|
||||
|
|
|
@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
|
||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
|
||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||
|
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
||||
|
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
|
||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
|||
|
||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#else
|
||||
|
@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define L1_DUALFETCH
|
||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||
#define L1_PREFETCH dcbtst
|
||||
|
@ -598,9 +598,14 @@ REALNAME:;\
|
|||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.long .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR],5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
|
@ -611,9 +616,14 @@ _section_.text:;\
|
|||
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.llong .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR], 5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
|
@ -802,7 +812,7 @@ Lmcount$lazy_ptr:
|
|||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#elif defined(POWER8) || defined(POWER9)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define QDOTC_K qdot_k
|
||||
#define QNRM2_K qnrm2_k
|
||||
#define QSCAL_K qscal_k
|
||||
#define QSUM_K qsum_k
|
||||
#define QSWAP_K qswap_k
|
||||
#define QROT_K qrot_k
|
||||
|
||||
|
@ -161,6 +162,7 @@
|
|||
#define QDOTC_K gotoblas -> qdot_k
|
||||
#define QNRM2_K gotoblas -> qnrm2_k
|
||||
#define QSCAL_K gotoblas -> qscal_k
|
||||
#define QSUM_K gotoblas -> qsum_k
|
||||
#define QSWAP_K gotoblas -> qswap_k
|
||||
#define QROT_K gotoblas -> qrot_k
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#define ISMAX_K ismax_k
|
||||
#define ISMIN_K ismin_k
|
||||
#define SASUM_K sasum_k
|
||||
#define SSUM_K ssum_k
|
||||
#define SAXPYU_K saxpy_k
|
||||
#define SAXPYC_K saxpy_k
|
||||
#define SCOPY_K scopy_k
|
||||
|
@ -170,6 +171,7 @@
|
|||
#define ISMAX_K gotoblas -> ismax_k
|
||||
#define ISMIN_K gotoblas -> ismin_k
|
||||
#define SASUM_K gotoblas -> sasum_k
|
||||
#define SSUM_K gotoblas -> ssum_k
|
||||
#define SAXPYU_K gotoblas -> saxpy_k
|
||||
#define SAXPYC_K gotoblas -> saxpy_k
|
||||
#define SCOPY_K gotoblas -> scopy_k
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define XDOTC_K xdotc_k
|
||||
#define XNRM2_K xnrm2_k
|
||||
#define XSCAL_K xscal_k
|
||||
#define XSUM_K xsum_k
|
||||
#define XSWAP_K xswap_k
|
||||
#define XROT_K xqrot_k
|
||||
|
||||
|
@ -227,6 +228,7 @@
|
|||
#define XDOTC_K gotoblas -> xdotc_k
|
||||
#define XNRM2_K gotoblas -> xnrm2_k
|
||||
#define XSCAL_K gotoblas -> xscal_k
|
||||
#define XSUM_K gotoblas -> xsum_k
|
||||
#define XSWAP_K gotoblas -> xswap_k
|
||||
#define XROT_K gotoblas -> xqrot_k
|
||||
|
||||
|
|
|
@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
|
||||
|
||||
return result;
|
||||
#endif
|
||||
|
|
|
@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
|||
"=b" (*ebx),
|
||||
"=c" (*ecx),
|
||||
"=d" (*edx)
|
||||
: "0" (op));
|
||||
: "0" (op), "c"(0));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define ZDOTC_K zdotc_k
|
||||
#define ZNRM2_K znrm2_k
|
||||
#define ZSCAL_K zscal_k
|
||||
#define ZSUM_K zsum_k
|
||||
#define ZSWAP_K zswap_k
|
||||
#define ZROT_K zdrot_k
|
||||
|
||||
|
@ -249,6 +250,7 @@
|
|||
#define ZDOTC_K gotoblas -> zdotc_k
|
||||
#define ZNRM2_K gotoblas -> znrm2_k
|
||||
#define ZSCAL_K gotoblas -> zscal_k
|
||||
#define ZSUM_K gotoblas -> zsum_k
|
||||
#define ZSWAP_K gotoblas -> zswap_k
|
||||
#define ZROT_K gotoblas -> zdrot_k
|
||||
|
||||
|
|
6
cpuid.h
6
cpuid.h
|
@ -53,6 +53,7 @@
|
|||
#define VENDOR_SIS 8
|
||||
#define VENDOR_TRANSMETA 9
|
||||
#define VENDOR_NSC 10
|
||||
#define VENDOR_HYGON 11
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
@ -116,6 +117,7 @@
|
|||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
#define CORE_SKYLAKEX 28
|
||||
#define CORE_DHYANA 29
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
@ -139,6 +141,7 @@
|
|||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
|
@ -214,5 +217,8 @@ typedef struct {
|
|||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
#define CPUTYPE_SKYLAKEX 52
|
||||
#define CPUTYPE_DHYANA 53
|
||||
|
||||
#define CPUTYPE_HYGON_UNKNOWN 54
|
||||
|
||||
#endif
|
||||
|
|
|
@ -39,6 +39,8 @@
|
|||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
@ -49,7 +51,8 @@ static char *cpuname[] = {
|
|||
"CORTEXA73",
|
||||
"FALKOR",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99"
|
||||
"THUNDERX2T99",
|
||||
"TSV110"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -61,7 +64,8 @@ static char *cpuname_lower[] = {
|
|||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99"
|
||||
"thunderx2t99",
|
||||
"tsv110"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -145,6 +149,9 @@ int detect(void)
|
|||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
@ -286,6 +293,21 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ char *corename[] = {
|
|||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8",
|
||||
"POWER8"
|
||||
"POWER9"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -124,7 +124,7 @@ int detect(void){
|
|||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
|
@ -156,7 +156,7 @@ int detect(void){
|
|||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
return CPUTYPE_POWER5;
|
||||
|
@ -180,7 +180,7 @@ int id;
|
|||
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||
switch ( id >> 16 ) {
|
||||
case 0x4e: // POWER9
|
||||
return CPUTYPE_POWER8;
|
||||
return CPUTYPE_POWER9;
|
||||
break;
|
||||
case 0x4d:
|
||||
case 0x4b: // POWER8/8E
|
||||
|
|
194
cpuid_x86.c
194
cpuid_x86.c
|
@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
|||
("mov %%ebx, %%edi;"
|
||||
"cpuid;"
|
||||
"xchgl %%ebx, %%edi;"
|
||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
|
||||
#else
|
||||
__asm__ __volatile__
|
||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -211,6 +211,44 @@ int support_avx(){
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
 * Report whether AVX2 kernels may be used on this machine.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises AVX2,
 * otherwise 0. Always returns 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* AVX2 is EBX bit 5 of CPUID leaf 7 (the same bit support_avx512()
     tests with "ebx & 32"). Bit 7, which was tested here before, is SMEP
     and is unrelated to AVX2. */
  return (ebx & (1<<5)) != 0;
#else
  return 0;
#endif
}
|
||||
|
||||
/*
 * Report whether AVX512 kernels may be used on this machine.
 *
 * Returns 1 only when support_avx() succeeds, CPUID leaf 7 advertises both
 * AVX2 (EBX bit 5) and AVX512VL (EBX bit 31), and the value read through
 * xgetbv(0, ...) has all bits of mask 0xe0 set. Returns 0 otherwise, and
 * always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* Without AVX2 there is no point in probing further. The original code
     only assigned ret=0 here, which had no effect. */
  if((ebx & 32) != 32)
    return 0;
  /* Unsigned constant: 1<<31 would overflow a signed int. */
  if((ebx & (1u<<31)) == 0)
    return 0;  /* AVX512VL not advertised */
  xgetbv(0, &eax, &edx);
  /* NOTE(review): 0xe0 presumably selects the opmask/ZMM state bits of
     XCR0 that the OS must enable - confirm against xgetbv(). */
  return (eax & 0xe0) == 0xe0;
#else
  return 0;
#endif
}
|
||||
|
||||
|
||||
int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
|
@ -233,6 +271,7 @@ int get_vendor(void){
|
|||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
|
||||
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
|
@ -294,6 +333,8 @@ int get_cputype(int gettype){
|
|||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
||||
#ifndef NO_AVX
|
||||
if (support_avx()) feature |= HAVE_AVX;
|
||||
if (support_avx2()) feature |= HAVE_AVX2;
|
||||
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||
#endif
|
||||
|
||||
|
@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
}
|
||||
}
|
||||
|
||||
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) {
|
||||
if ((get_vendor() == VENDOR_AMD) ||
|
||||
(get_vendor() == VENDOR_HYGON) ||
|
||||
(get_vendor() == VENDOR_CENTAUR)) {
|
||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
LDTB.size = 4096;
|
||||
|
@ -1228,22 +1271,18 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
case 15:
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
|
@ -1252,33 +1291,27 @@ int get_cpuname(void){
|
|||
switch (model) {
|
||||
case 5:
|
||||
case 6:
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7:
|
||||
case 15:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14:
|
||||
//Skylake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
|
@ -1292,80 +1325,66 @@ int get_cpuname(void){
|
|||
switch (model) {
|
||||
case 6:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7:
|
||||
// Xeon Phi Knights Landing
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
// Apollo Lake
|
||||
case 15:
|
||||
// Denverton
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
switch (model) {
|
||||
case 6: // Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 14: // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
|
@ -1469,6 +1488,26 @@ int get_cpuname(void){
|
|||
return CPUTYPE_AMD_UNKNOWN;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_HYGON){
|
||||
switch (family) {
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 9:
|
||||
//Hygon Dhyana
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return CPUTYPE_HYGON_UNKNOWN;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CYRIX){
|
||||
switch (family) {
|
||||
case 0x4:
|
||||
|
@ -1590,7 +1629,8 @@ static char *cpuname[] = {
|
|||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1645,7 +1685,8 @@ static char *lowercpuname[] = {
|
|||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1677,7 +1718,8 @@ static char *corename[] = {
|
|||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -1709,7 +1751,8 @@ static char *corename_lower[] = {
|
|||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
};
|
||||
|
||||
|
||||
|
@ -2026,6 +2069,23 @@ int get_coretype(void){
|
|||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_HYGON){
|
||||
if (family == 0xf){
|
||||
if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CORE_BARCELONA;
|
||||
} else {
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
|
@ -2112,6 +2172,8 @@ void get_cpuconfig(void){
|
|||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
||||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
||||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
|
@ -2180,6 +2242,8 @@ void get_sse(void){
|
|||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
||||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
||||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
|
|
|
@ -27,9 +27,9 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
|
@ -64,10 +64,8 @@ int detect(void)
|
|||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
@ -116,7 +114,14 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define L1_DATA_SIZE 131072\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8\n");
|
||||
printf("#define L2_SIZE 4194304\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
2
ctest.c
2
ctest.c
|
@ -113,7 +113,7 @@ ARCH_X86
|
|||
ARCH_X86_64
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
|
||||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
|
|
|
@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
|
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
range_m[num_cpu + 1] = range_m[num_cpu] + width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
|
|
|
@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
ifeq ($(ARCH),arm64)
|
||||
COMMONOBJS += dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),power)
|
||||
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
|
@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
ifeq ($(ARCH),arm64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),power)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
|
@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
SetEvent(pool.killed);
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
#endif
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
CloseHandle(pool.filled);
|
||||
CloseHandle(pool.killed);
|
||||
|
||||
blas_server_avail = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX;
|
|||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
@ -304,9 +305,49 @@ int support_avx(){
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
 * Report whether the AVX2 code paths may be selected at run time.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises AVX2,
 * 0 otherwise.  Always returns 0 when the library is built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;
  int ret=0;

  /* AVX2 is only considered when the plain AVX check already passed. */
  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 feature flag.  The previous
     test of bit 7 looked at SMEP, which is unrelated to AVX2. */
  if((ebx & (1<<5)) != 0)
    ret=1;  //CPU reports AVX2
  return ret;
#else
  return 0;
#endif
}
|
||||
|
||||
/*
 * Report whether the AVX512VL code paths may be selected at run time.
 *
 * Returns 1 only when all of the following hold, 0 otherwise:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 advertises AVX2 (EBX bit 5),
 *   - CPUID leaf 7 advertises AVX512VL (EBX bit 31),
 *   - bits 5..7 of the value fetched by xgetbv(0, ...) are all set.
 * Always returns 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);

  /* No AVX2 means no AVX512 either.  The earlier form of this test,
     "(ebx & (1<<7)) != 1", was always true and only assigned ret=0,
     so it never rejected anything. */
  if((ebx & (1<<5)) == 0)
    return 0;

  /* Use an unsigned mask: 1<<31 would shift into the sign bit of an int. */
  if(((unsigned int)ebx & (1U<<31)) == 0)
    return 0;

  /* NOTE(review): xgetbv(0, ...) is presumably reading XCR0, where bits 5..7
     (0xe0) are the opmask / ZMM state components the OS must enable. */
  xgetbv(0, &eax, &edx);
  if((eax & 0xe0) == 0xe0)
    return 1;  //OS supports AVX512VL
  return 0;
#else
  return 0;
#endif
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
static int get_vendor(void){
|
||||
|
@ -329,6 +370,7 @@ static int get_vendor(void){
|
|||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
|
@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
//Intel Haswell
|
||||
if (model == 12 || model == 15) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 13) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){
|
|||
case 4:
|
||||
//Intel Haswell
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 7 || model == 15) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
@ -457,72 +514,86 @@ static gotoblas_t *get_coretype(void){
|
|||
case 5:
|
||||
//Intel Broadwell
|
||||
if (model == 6) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx())
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake
|
||||
if (model == 12) {
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return &gotoblas_HASWELL;
|
||||
#else
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return &gotoblas_NEHALEM;
|
||||
#endif
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
@ -535,7 +606,7 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
||||
if (family <= 0xe) {
|
||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
|
@ -615,6 +686,13 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
/* Printable core names.  The position of each entry is significant:
   gotoblas_corename() returns entries by index and force_coretype()
   switches on the index of the matching entry. */
static char *corename[] = {
  "unknown",  /* 0: no kernel table associated */
  "POWER6",   /* 1 */
  "POWER8",   /* 2 */
  "POWER9"    /* 3 */
};

/* Number of entries in corename[]; update together with the table. */
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char * coretype) {
|
||||
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i = 0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to POWER8 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_POWER8;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
|
@ -198,45 +198,68 @@ int get_num_procs(void);
|
|||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -1073,11 +1096,6 @@ static volatile int memory_initialized = 0;
|
|||
}
|
||||
free(table);
|
||||
}
|
||||
#if defined(OS_WINDOWS)
|
||||
TlsFree(local_storage_key);
|
||||
#else
|
||||
pthread_key_delete(local_storage_key);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void blas_memory_init(){
|
||||
|
@ -1295,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
|
|||
free(map_address);
|
||||
}
|
||||
|
||||
#ifdef SMP
|
||||
/* Pass the table returned by get_memory_table() to blas_memory_cleanup(). */
void blas_thread_memory_cleanup(void)
{
  void *table = (void*)get_memory_table();

  blas_memory_cleanup(table);
}
|
||||
#endif
|
||||
|
||||
|
||||
void blas_shutdown(void){
|
||||
#ifdef SMP
|
||||
BLASFUNC(blas_thread_shutdown)();
|
||||
|
@ -1304,7 +1329,7 @@ void blas_shutdown(void){
|
|||
/* Only clean up if we were built for threading and TLS was initialized */
|
||||
if (local_storage_key)
|
||||
#endif
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
base_address = 0UL;
|
||||
|
@ -1491,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) {
|
|||
|
||||
blas_shutdown();
|
||||
|
||||
#if defined(SMP)
|
||||
#if defined(OS_WINDOWS)
|
||||
TlsFree(local_storage_key);
|
||||
#else
|
||||
pthread_key_delete(local_storage_key);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
|
@ -1526,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
|||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP)
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
|
@ -1600,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#endif
|
||||
|
||||
#else
|
||||
/* USE_TLS / COMPILE_TLS not set */
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
#define ALLOC_WINDOWS
|
||||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
|
@ -1616,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#ifndef OS_WINDOWS
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
@ -1636,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
@ -1675,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||
#else
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
|
@ -1701,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {};
|
|||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -1753,7 +1816,7 @@ int get_num_procs(void) {
|
|||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
|
@ -1790,7 +1853,7 @@ int get_num_procs(void) {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
|
@ -1867,7 +1930,7 @@ void openblas_fork_handler()
|
|||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
|
||||
// In the mean time build with USE_OPENMP=0 or link against another
|
||||
// implementation of OpenMP.
|
||||
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
int err;
|
||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
||||
if(err != 0)
|
||||
|
@ -1880,7 +1943,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
@ -1888,11 +1951,11 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
blas_goto_num = 0;
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
@ -1904,7 +1967,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
|
@ -1912,7 +1975,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
@ -1999,11 +2062,15 @@ static void *alloc_mmap(void *address){
|
|||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
|
@ -2145,14 +2212,18 @@ static void *alloc_mmap(void *address){
|
|||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
}
|
||||
#endif
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -2520,7 +2591,7 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos;
|
||||
int mypos = 0;
|
||||
#endif
|
||||
|
||||
void *map_address;
|
||||
|
@ -2551,6 +2622,11 @@ void *blas_memory_alloc(int procpos){
|
|||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory_initialized) {
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
|
@ -2586,6 +2662,9 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#if defined(USE_OPENMP)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
|
@ -2600,13 +2679,17 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
// blas_lock(&memory[position].lock);
|
||||
|
||||
#else
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
// blas_unlock(&memory[position].lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
position ++;
|
||||
|
@ -2618,21 +2701,26 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
position = 0;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
/* if (!memory[position].used) { */
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory[position].used) {
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
/* } */
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
goto error;
|
||||
|
||||
allocation :
|
||||
|
@ -2642,10 +2730,11 @@ void *blas_memory_alloc(int procpos){
|
|||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].addr) {
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
|
@ -2690,9 +2779,13 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
|
@ -2746,8 +2839,9 @@ void blas_memory_free(void *free_area){
|
|||
#endif
|
||||
|
||||
position = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
|
@ -2761,7 +2855,9 @@ void blas_memory_free(void *free_area){
|
|||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
|
@ -2776,8 +2872,9 @@ void blas_memory_free(void *free_area){
|
|||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static char* openblas_config_str=""
|
||||
"OpenBLAS "
|
||||
VERSION
|
||||
|
|
|
@ -141,6 +141,14 @@ else
|
|||
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
|
||||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
|
@ -152,6 +160,7 @@ else
|
|||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
endif
|
||||
endif
|
||||
rm -f linktest
|
||||
|
||||
|
|
|
@ -40,15 +40,25 @@
|
|||
|
||||
void gotoblas_init(void);
|
||||
void gotoblas_quit(void);
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
void blas_thread_memory_cleanup(void);
|
||||
#endif
|
||||
|
||||
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
||||
|
||||
if (reason == DLL_PROCESS_ATTACH) {
|
||||
gotoblas_init();
|
||||
}
|
||||
|
||||
if (reason == DLL_PROCESS_DETACH) {
|
||||
gotoblas_quit();
|
||||
switch(reason) {
|
||||
case DLL_PROCESS_ATTACH:
|
||||
gotoblas_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
|
62
getarch.c
62
getarch.c
|
@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
/* Define NO_AVX512 unless the compiler reports either __GNUC__ > 6 together
 * with __AVX2__, or clang with __clang_major__ >= 6.  The FORCE_SKYLAKEX
 * section later in this file checks NO_AVX512 and, when it is set, emits the
 * HASWELL configuration instead. */
#if !((defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || \
      (defined(__clang__) && __clang_major__ >= 6))
#define NO_AVX512
#endif
|
||||
/* #define FORCE_P2 */
|
||||
/* #define FORCE_KATMAI */
|
||||
/* #define FORCE_COPPERMINE */
|
||||
|
@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
|
@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
|
@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "POWER8"
|
||||
#endif
|
||||
|
||||
/* Forced-target settings used when FORCE_POWER9 is defined: marks the target
 * as forced (FORCE), names it POWER / POWER9 with SUBDIRNAME "power", and
 * builds the ARCHCONFIG string from -DPOWER9 plus the L1 data, L2 and DTB
 * definitions listed below. */
#if defined(FORCE_POWER9)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER9"
|
||||
#define SUBDIRNAME "power"
|
||||
#define ARCHCONFIG "-DPOWER9 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "power9"
|
||||
#define CORENAME "POWER9"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_PPCG4
|
||||
#define FORCE
|
||||
|
@ -1046,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
/* Forced-target settings used when FORCE_TSV110 is defined: marks the target
 * as forced (FORCE), names it ARM64 / TSV110 with SUBDIRNAME "arm64", and
 * builds the ARCHCONFIG string from -DTSV110, the L1 code/data, L2 and DTB
 * definitions, and the HAVE_VFP*/HAVE_NEON/ARMV8 feature defines listed
 * below.  The trailing #else branch is empty. */
#ifdef FORCE_TSV110
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "TSV110"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTSV110 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "tsv110"
|
||||
#define CORENAME "TSV110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
@ -1066,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "Z13"
|
||||
#endif
|
||||
|
||||
/* Forced-target settings used when FORCE_Z14 is defined: marks the target as
 * forced (FORCE), names it ZARCH / Z14, and sets ARCHCONFIG to -DZ14 plus
 * -DDTB_DEFAULT_ENTRIES=64.  No cache-size definitions are emitted here. */
#ifdef FORCE_Z14
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
#define SUBARCHITECTURE "Z14"
|
||||
#define ARCHCONFIG "-DZ14 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64"
|
||||
#define LIBNAME "z14"
|
||||
#define CORENAME "Z14"
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt"
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
|
||||
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
|
||||
#ifndef POWER
|
||||
|
|
|
@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
|
|||
rotm.c rotmg.c # N.B. these do not have complex counterparts
|
||||
rot.c
|
||||
asum.c
|
||||
sum.c
|
||||
)
|
||||
|
||||
# these will have 'z' prepended for the complex version
|
||||
|
@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
|
||||
|
@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ SBLAS1OBJS = \
|
|||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
|
||||
scopy.$(SUFFIX) sscal.$(SUFFIX) \
|
||||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
|
||||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
|
||||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
|
||||
|
@ -51,7 +51,7 @@ DBLAS1OBJS = \
|
|||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
|
||||
ddot.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
|
||||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
|
||||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
|
||||
|
@ -76,7 +76,7 @@ CBLAS1OBJS = \
|
|||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
|
||||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scamax.$(SUFFIX) icamax.$(SUFFIX) \
|
||||
scamin.$(SUFFIX) icamin.$(SUFFIX) \
|
||||
csrot.$(SUFFIX) crotg.$(SUFFIX) \
|
||||
|
@ -105,7 +105,7 @@ ZBLAS1OBJS = \
|
|||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
|
||||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
|
||||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
|
||||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
|
||||
|
@ -146,7 +146,7 @@ QBLAS1OBJS = \
|
|||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qdot.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
|
@ -168,7 +168,7 @@ XBLAS1OBJS = \
|
|||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
|
@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
|
|||
QBLAS1OBJS = \
|
||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
|
@ -224,7 +224,7 @@ QBLAS3OBJS = \
|
|||
XBLAS1OBJS = \
|
||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
|
@ -263,7 +263,8 @@ CSBLAS1OBJS = \
|
|||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||
|
||||
CSBLAS2OBJS = \
|
||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||
|
@ -280,7 +281,8 @@ CDBLAS1OBJS = \
|
|||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||
|
||||
CDBLAS2OBJS = \
|
||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||
|
@ -300,7 +302,8 @@ CCBLAS1OBJS = \
|
|||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX)
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
|
@ -326,7 +329,9 @@ CZBLAS1OBJS = \
|
|||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX)
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
|
||||
|
@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
|
|||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
# SUM interface objects: all six variants (ssum, dsum, qsum, scsum, dzsum,
# qxsum), in both $(SUFFIX) and $(PSUFFIX) flavours, are compiled from the
# same sum.c with an identical recipe.  The recipe adds no per-variant flags,
# so any precision/complex selection presumably arrives through $(CFLAGS)
# (TODO confirm where those are set).
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c
|
|||
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
# CBLAS index-of-max/min objects for the complex variants (ic*, iz*), all
# compiled from imax.c with -DCBLAS.  USE_ABS is explicitly undefined for all
# four; the *max rules also undefine USE_MIN, while the *min rules define it.
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
|
|||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
# CBLAS SUM objects (cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum): the
# same sum.c as the non-CBLAS rules, compiled with -DCBLAS so that the CNAME
# entry point in sum.c is built instead of NAME.
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
|
|||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
/* Non-CBLAS entry point (compiled when CBLAS is not defined).
 * N and INCX are passed by pointer and read once into BLASLONG locals.
 * Returns 0 when *N <= 0; otherwise returns SUM_K(n, x, incx) cast to
 * FLOATRET.  SUM_K and the debug/profile macros are defined elsewhere. */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||
|
||||
BLASLONG n = *N;
|
||||
BLASLONG incx = *INCX;
|
||||
FLOATRET ret;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
/* Early exit happens before the IDEBUG/FUNCTION_PROFILE macros run. */
if (n <= 0) return 0;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
ret = (FLOATRET)SUM_K(n, x, incx);
|
||||
|
||||
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else
|
||||
/* CBLAS entry point (compiled when CBLAS is defined).
 * n and incx are passed by value.  In COMPLEX builds the vector arrives as
 * void* and is cast to FLOAT*; otherwise it is taken as FLOAT* directly.
 * Returns 0 when n <= 0; otherwise returns SUM_K(n, x, incx).  SUM_K and the
 * debug/profile macros are defined elsewhere. */
#ifdef COMPLEX
|
||||
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||
FLOAT *x = (FLOAT*) vx;
|
||||
#else
|
||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||
#endif
|
||||
|
||||
FLOAT ret;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
/* Early exit happens before the IDEBUG/FUNCTION_PROFILE macros run. */
if (n <= 0) return 0;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
ret = SUM_K(n, x, incx);
|
||||
|
||||
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
#ifdef SMP
|
||||
/* nthreads = num_cpu_avail(2);
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
FIXME trmv_thread was found to be broken, see issue 1332 */
|
||||
nthreads = 1;
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
|
|
|
@ -81,6 +81,12 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/* Multiplier applied to GEMM_MULTITHREAD_THRESHOLD when the CBLAS path in
 * this file decides whether m * n is too small for more than one thread:
 * 128 for COMPLEX builds, 256 otherwise. */
#ifdef COMPLEX
#define SMP_FACTOR 128
#else
#define SMP_FACTOR 256
#endif
|
||||
|
||||
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
#ifndef TRMM
|
||||
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
|
||||
|
@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
|
|||
if (side < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||
|
||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
/*
|
||||
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
*/
|
||||
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
|
|
|
@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
} else
|
||||
nthreads = 1;
|
||||
|
||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
|
|
|
@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
|
||||
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
|
||||
|
|
|
@ -340,6 +340,32 @@ ifndef XSCALKERNEL
|
|||
XSCALKERNEL = zscal.S
|
||||
endif
|
||||
|
||||
### SUM ###
# Fallback kernel sources for the SUM kernels.  Each variable is assigned only
# when it is not already set: the real variants (S, D, Q) fall back to sum.S
# and the complex variants (C, Z, X) fall back to zsum.S.
|
||||
|
||||
ifndef SSUMKERNEL
|
||||
SSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef DSUMKERNEL
|
||||
DSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef CSUMKERNEL
|
||||
CSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
ifndef ZSUMKERNEL
|
||||
ZSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
ifndef QSUMKERNEL
|
||||
QSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef XSUMKERNEL
|
||||
XSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
### SWAP ###
|
||||
|
||||
ifndef SSWAPKERNEL
|
||||
|
@ -453,7 +479,7 @@ endif
|
|||
SBLASOBJS += \
|
||||
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
|
||||
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
|
||||
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
|
||||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
saxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
|
@ -463,31 +489,32 @@ DBLASOBJS += \
|
|||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
|
||||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
daxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
QBLASOBJS += \
|
||||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
|
||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
|
||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
qsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CBLASOBJS += \
|
||||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZBLASOBJS += \
|
||||
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
XBLASOBJS += \
|
||||
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
### AMAX ###
|
||||
|
||||
|
@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
|
||||
|
||||
|
||||
### ASUM ###
|
||||
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
|
@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
### SUM ###
|
||||
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
### AXPY ###
|
||||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
ifeq ($(CORE), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
@ -44,10 +44,18 @@ ifeq ($(CORE), POWER8)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define PREFETCHSIZE 88
|
||||
|
||||
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
ble N, $L999
|
||||
|
||||
sra N, 3, I
|
||||
fclr s1
|
||||
fclr s2
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
fclr t1
|
||||
SXADDQ INCX, X, X
|
||||
fclr t2
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
fclr t3
|
||||
SXADDQ INCX, X, X
|
||||
fclr s3
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a5, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
$L12:
|
||||
ADD s0, t0, s0
|
||||
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 0 * SIZE(X)
|
||||
fmov a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a3, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 0 * SIZE(X)
|
||||
fmov a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fmov a5, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 0 * SIZE(X)
|
||||
fmov a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fmov a7, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fmov a2, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fmov a4, t0
|
||||
ADD s1, t1, s1
|
||||
fmov a5, t1
|
||||
ADD s2, t2, s2
|
||||
fmov a6, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a7, t3
|
||||
|
||||
ADD s1, t1, s1
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
ADD s0, s1, s0
|
||||
ADD s2, s3, s2
|
||||
.align 4
|
||||
|
||||
$L15:
|
||||
and N, 7, I
|
||||
ADD s0, s2, s0
|
||||
unop
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
fmov a0, t0
|
||||
|
||||
lda I, -1(I)
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
$L999:
|
||||
ADD s0, t0, s0
|
||||
ret
|
||||
EPILOGUE
|
|
@ -0,0 +1,208 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define PREFETCHSIZE 88
|
||||
|
||||
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
addq INCX, INCX, INCX
|
||||
|
||||
fclr s1
|
||||
unop
|
||||
fclr t1
|
||||
ble N, $L999
|
||||
|
||||
fclr s2
|
||||
sra N, 2, I
|
||||
fclr s3
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
fclr t2
|
||||
LD a1, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
LD a5, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
lda I, -1(I)
|
||||
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
$L12:
|
||||
ADD s0, t0, s0
|
||||
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 1 * SIZE(X)
|
||||
fmov a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a3, t3
|
||||
unop
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 1 * SIZE(X)
|
||||
fmov a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fmov a5, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 1 * SIZE(X)
|
||||
fmov a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fmov a7, t3
|
||||
unop
|
||||
|
||||
LD a5, 1 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 1 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fmov a2, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fmov a4, t0
|
||||
ADD s1, t1, s1
|
||||
fmov a5, t1
|
||||
ADD s2, t2, s2
|
||||
fmov a6, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a7, t3
|
||||
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
.align 4
|
||||
|
||||
$L15:
|
||||
ADD s0, s2, s0
|
||||
and N, 3, I
|
||||
ADD s1, s3, s1
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a1, 1 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
$L999:
|
||||
ADD s0, t0, s0
|
||||
ADD s1, t1, s1
|
||||
|
||||
ADD s0, s1, s0
|
||||
ret
|
||||
EPILOGUE
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S
|
|||
CASUMKERNEL = asum_vfp.S
|
||||
ZASUMKERNEL = asum_vfp.S
|
||||
|
||||
SSUMKERNEL = sum_vfp.S
|
||||
DSUMKERNEL = sum_vfp.S
|
||||
|
||||
SAXPYKERNEL = axpy_vfp.S
|
||||
DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
|
|
|
@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of asum.c with the ABS() removed *
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/*
 * Plain sum (no absolute value taken) of n entries of x, reading one entry
 * every inc_x positions starting at x[0].
 *
 * n     - number of entries to add up
 * x     - input vector
 * inc_x - distance between consecutive entries, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0)
		return(total);

	/* Exclusive upper bound for the index: n entries, inc_x apart. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return(total);
}
|
||||
|
||||
|
|
@ -0,0 +1,425 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
|
||||
**************************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
|
||||
|
||||
#define I r12
|
||||
|
||||
#define X_PRE 512
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/**************************************************************************************
* Entry point of the VFP sum kernel.
*
* Register usage, as set up by the defines at the top of this file:
*   N     (r0)  - element count
*   X     (r1)  - address of the first element
*   INC_X (r2)  - stride between elements, in elements on entry
*   I     (r12) - loop counter
*
* The partial sums are kept in s0/s1 (d0/d1 when DOUBLE is defined) and are
* added together at asum_kernel_L999.
*
* A count N <= 0 or a stride INC_X <= 0 yields a result of zero. Rejecting
* negative strides (and not only a zero stride) matches the generic C sum
* kernels and keeps the strided loop from walking to addresses below X.
**************************************************************************************/

	PROLOGUE

	.align 5

	movs	r12, #0				// clear floating point register
	vmov	s0, r12
	vmov	s1, r12
#if defined(DOUBLE)
	vcvt.f64.f32	d0, s0
	vcvt.f64.f32	d1, s1
#endif

	cmp	N, #0
	ble	asum_kernel_L999		// nothing to add: return zero

	cmp	INC_X, #0
	ble	asum_kernel_L999		// zero or negative stride: return zero

	cmp	INC_X, #1
	bne	asum_kernel_S_BEGIN		// non-unit stride: use the strided loops


asum_kernel_F_BEGIN:				// unit stride path

	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_F1

	.align 5

asum_kernel_F4:					// body is unrolled twice per iteration

#if !defined(DOUBLE) && !defined(COMPLEX)
	pld	[ X, #X_PRE ]
#endif
	KERNEL_F4

	subs	I, I, #1
	ble	asum_kernel_F1

	KERNEL_F4

	subs	I, I, #1
	bne	asum_kernel_F4

asum_kernel_F1:

	ands	I, N, #3			// I = N % 4, the leftover elements
	ble	asum_kernel_L999

asum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	asum_kernel_F10

	b	asum_kernel_L999

asum_kernel_S_BEGIN:				// strided path: turn INC_X into a byte offset

#if defined(COMPLEX)

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4		// INC_X * SIZE * 2
#else
	lsl	INC_X, INC_X, #3		// INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		// INC_X * SIZE
#else
	lsl	INC_X, INC_X, #2		// INC_X * SIZE
#endif

#endif

	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_S1

	.align 5

asum_kernel_S4:

	KERNEL_S4

	subs	I, I, #1
	bne	asum_kernel_S4

asum_kernel_S1:

	ands	I, N, #3			// I = N % 4, the leftover elements
	ble	asum_kernel_L999

asum_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	asum_kernel_S10


asum_kernel_L999:				// common exit: combine the two partial sums


#if defined(DOUBLE)
	vadd.f64	d0 , d0, d1		// set return value
#else
	vadd.f32	s0 , s0, s1		// set return value
#endif

	/* Without __ARM_PCS_VFP the result is also copied into r0 (r0/r1 for DOUBLE). */
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
	vmov	r0, s0
#else
	vmov	r0, r1, d0
#endif
#endif

	bx	lr

	EPILOGUE
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of zasum.c with the ABS() removed *
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CSUM1(x,i) x[i]+x[i+1]
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += CSUM1(x,i);
|
||||
i += inc_x2;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Contiguous path, one element: add the two consecutive floats at X to SUMF
   and advance X by 8 bytes. */
.macro KERNEL_F1
|
||||
/* v1 = the two floats at X (post-incremented by 8 bytes) */
ld1 {v1.2s}, [X], #8
|
||||
/* rotate v1 by 4 bytes so its second float lands in lane 0 of v2 (s2) */
ext v2.8b, v1.8b, v1.8b, #4
|
||||
/* TMPF = first float + second float */
fadd TMPF, TMPF, s2
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/* Contiguous path, 64 bytes per call: load 16 floats (v1..v4) and accumulate
   them lane-wise into the four lanes of v0. X advances by 64 bytes. */
.macro KERNEL_F8
|
||||
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
|
||||
add X, X, #64
|
||||
|
||||
/* prefetch 1024 bytes past the already advanced X */
PRFM PLDL1KEEP, [X, #1024]
|
||||
|
||||
/* pairwise tree: (v1+v2) and (v3+v4), then both into the accumulator v0 */
fadd v1.4s, v1.4s, v2.4s
|
||||
fadd v3.4s, v3.4s, v4.4s
|
||||
fadd v0.4s, v0.4s, v1.4s
|
||||
fadd v0.4s, v0.4s, v3.4s
|
||||
.endm
|
||||
|
||||
/* Reduce the four float lanes of the accumulator v0 to the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
|
||||
/* v1 = v0 rotated by 8 bytes: lanes 2,3 of v0 move to lanes 0,1 */
ext v1.16b, v0.16b, v0.16b, #8
|
||||
/* lanes 0,1 of v0 = (lane0 + lane2), (lane1 + lane3) */
fadd v0.2s, v0.2s, v1.2s
|
||||
/* SUMF = lane0 + lane1 */
faddp SUMF, v0.2s
|
||||
.endm
|
||||
|
||||
/* Strided path set-up: scale INC_X by 8 (shift left by 3) so it can be used
   as a byte post-increment for elements made of two 4-byte floats. */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #3
|
||||
.endm
|
||||
|
||||
/* Strided path, one element: same arithmetic as KERNEL_F1, but X is
   post-incremented by INC_X (already scaled to bytes by INIT_S). */
.macro KERNEL_S1
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
/* second float of the pair into lane 0 of v2 (s2) */
ext v2.8b, v1.8b, v1.8b, #4
|
||||
fadd TMPF, TMPF, s2
|
||||
fadd SUMF, SUMF, TMPF
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point. Adds up both floats of each of the N elements of X and leaves
 * the total in SUMF (s0). SUMF is 0 on return when N <= 0 or INC_X <= 0.
 * INC_X == 1 takes the contiguous path (.Lcsum_kernel_F_*), any other
 * positive stride takes the strided path (.Lcsum_kernel_S_*).
 */
PROLOGUE
|
||||
|
||||
/* SUMF = 0. KERNEL_F8 accumulates into every lane of v0; on AArch64 a scalar
   FP register write also zeroes the remaining bits of the vector register. */
fmov SUMF, REG0
|
||||
fmov s1, SUMF
|
||||
|
||||
/* nothing to do for N <= 0 or INC_X <= 0 */
cmp N, xzr
|
||||
ble .Lcsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne .Lcsum_kernel_S_BEGIN
|
||||
|
||||
.Lcsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = N / 8: number of KERNEL_F8 passes (8 elements = 16 floats each) */
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq .Lcsum_kernel_F1
|
||||
|
||||
.Lcsum_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_F8
|
||||
|
||||
/* fold the vector accumulator into the scalar SUMF */
KERNEL_F8_FINALIZE
|
||||
|
||||
.Lcsum_kernel_F1:
|
||||
|
||||
/* I = N % 8: leftover elements, handled one at a time */
ands I, N, #7
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
.Lcsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_F10
|
||||
|
||||
.Lcsum_kernel_L999:
|
||||
ret
|
||||
|
||||
.Lcsum_kernel_S_BEGIN:
|
||||
|
||||
/* convert INC_X from elements to bytes */
INIT_S
|
||||
|
||||
/* I = N / 4: number of passes of the 4x unrolled strided loop */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lcsum_kernel_S1
|
||||
|
||||
.Lcsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_S4
|
||||
|
||||
.Lcsum_kernel_S1:
|
||||
|
||||
/* I = N % 4: leftover elements */
ands I, N, #3
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
.Lcsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,186 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define SUMF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Contiguous path, one element: load the value at X into TMPF, advance X by
   SZ bytes and add it to SUMF. */
.macro KERNEL_F1
|
||||
ldr TMPF, [X], #SZ
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/* Contiguous path, 8 elements per call: load 8 values and accumulate them
   lane-wise into v0 (4 float lanes, or 2 double lanes when DOUBLE is set). */
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.4s, v2.4s}, [X], #32 // load 8 floats: v1 = X[0..3], v2 = X[4..7]; X += 32 bytes
|
||||
fadd v1.4s, v1.4s, v2.4s // v1[k] = X[k] + X[k+4]
|
||||
fadd v0.4s, v0.4s, v1.4s // accumulate lane-wise into v0
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
#else // DOUBLE
|
||||
/* 8 doubles (64 bytes) into v2..v5 */
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
|
||||
add X, X, #64
|
||||
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
|
||||
/* pairwise tree: (v2+v3) and (v4+v5), then both into the accumulator v0 */
fadd v2.2d, v2.2d, v3.2d
|
||||
fadd v4.2d, v4.2d, v5.2d
|
||||
fadd v0.2d, v0.2d, v2.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Reduce the vector accumulator v0 to the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
|
||||
#if !defined(DOUBLE)
|
||||
/* four float lanes: fold lanes 2,3 onto lanes 0,1, then add the pair */
ext v1.16b, v0.16b, v0.16b, #8
|
||||
fadd v0.2s, v0.2s, v1.2s
|
||||
faddp SUMF, v0.2s
|
||||
#else
|
||||
/* two double lanes: a single pairwise add */
faddp SUMF, v0.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided path set-up: scale INC_X to a byte stride, x4 for the
   single-precision build and x8 when DOUBLE is defined. */
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided path, one element: load the value at X into lane 0 of v1 (read back
   as TMPF), post-increment X by the byte stride INC_X, and add it to SUMF. */
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point. Adds up the N elements of X (float, or double when DOUBLE is
 * defined) and leaves the total in SUMF. SUMF is 0 on return when N <= 0 or
 * INC_X <= 0. INC_X == 1 takes the contiguous path (.Lsum_kernel_F_*), any
 * other positive stride takes the strided path (.Lsum_kernel_S_*).
 */
PROLOGUE
|
||||
|
||||
/* SUMF = 0. KERNEL_F8 accumulates into every lane of v0; on AArch64 a scalar
   FP register write also zeroes the remaining bits of the vector register. */
fmov SUMF, REG0
|
||||
#if !defined(DOUBLE)
|
||||
fmov s1, SUMF
|
||||
#else
|
||||
fmov d1, SUMF
|
||||
#endif
|
||||
|
||||
/* nothing to do for N <= 0 or INC_X <= 0 */
cmp N, xzr
|
||||
ble .Lsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne .Lsum_kernel_S_BEGIN
|
||||
|
||||
.Lsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = N / 8: number of KERNEL_F8 passes (8 elements each) */
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq .Lsum_kernel_F1
|
||||
|
||||
.Lsum_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_F8
|
||||
|
||||
/* fold the vector accumulator into the scalar SUMF */
KERNEL_F8_FINALIZE
|
||||
|
||||
.Lsum_kernel_F1:
|
||||
|
||||
/* I = N % 8: leftover elements, handled one at a time */
ands I, N, #7
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
.Lsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_F10
|
||||
|
||||
.Lsum_kernel_L999:
|
||||
ret
|
||||
|
||||
.Lsum_kernel_S_BEGIN:
|
||||
|
||||
/* convert INC_X from elements to bytes */
INIT_S
|
||||
|
||||
/* I = N / 4: number of passes of the 4x unrolled strided loop */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lsum_kernel_S1
|
||||
|
||||
.Lsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_S4
|
||||
|
||||
.Lsum_kernel_S1:
|
||||
|
||||
/* I = N % 4: leftover elements */
ands I, N, #3
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
.Lsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,158 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define REG0 xzr
|
||||
#define SUMF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Contiguous path, one element: add the two consecutive doubles at X to SUMF
   and advance X by 16 bytes. */
.macro KERNEL_F1
|
||||
ld1 {v1.2d}, [X], #16
|
||||
/* TMPF = first double + second double */
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/* Contiguous path, 64 bytes per call: load 8 doubles (v1..v4) and accumulate
   them lane-wise into the two lanes of v0. X advances by 64 bytes. */
.macro KERNEL_F4
|
||||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
|
||||
|
||||
/* pairwise tree: (v1+v2) and (v3+v4) ... */
fadd v1.2d, v1.2d, v2.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
|
||||
/* ... then both partial sums into the accumulator v0 */
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v0.2d, v0.2d, v3.2d
|
||||
|
||||
/* prefetch 1024 bytes past the already advanced X */
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
||||
/* Reduce the two double lanes of the accumulator v0 to the scalar SUMF. */
.macro KERNEL_F4_FINALIZE
|
||||
faddp SUMF, v0.2d
|
||||
.endm
|
||||
|
||||
/* Strided path set-up: scale INC_X by 16 (shift left by 4) so it can be used
   as a byte post-increment for elements made of two 8-byte doubles. */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #4
|
||||
.endm
|
||||
|
||||
/* Strided path, one element: same arithmetic as KERNEL_F1, but X is
   post-incremented by INC_X (already scaled to bytes by INIT_S). */
.macro KERNEL_S1
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
/* TMPF = first double + second double */
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point. Adds up both doubles of each of the N elements of X and leaves
 * the total in SUMF (d0). SUMF is 0 on return when N <= 0 or INC_X <= 0.
 * INC_X == 1 takes the contiguous path (.Lzsum_kernel_F_*), any other
 * positive stride takes the strided path (.Lzsum_kernel_S_*).
 */
PROLOGUE
|
||||
|
||||
/* SUMF = 0. KERNEL_F4 accumulates into both lanes of v0; on AArch64 a scalar
   FP register write also zeroes the remaining bits of the vector register. */
fmov SUMF, REG0
|
||||
|
||||
/* nothing to do for N <= 0 or INC_X <= 0 */
cmp N, xzr
|
||||
ble .Lzsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne .Lzsum_kernel_S_BEGIN
|
||||
|
||||
.Lzsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = N / 4: number of KERNEL_F4 passes (4 elements = 8 doubles each) */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
beq .Lzsum_kernel_F1
|
||||
|
||||
.Lzsum_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_F4
|
||||
|
||||
/* fold the vector accumulator into the scalar SUMF */
KERNEL_F4_FINALIZE
|
||||
|
||||
.Lzsum_kernel_F1:
|
||||
|
||||
/* I = N % 4: leftover elements, handled one at a time */
ands I, N, #3
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
.Lzsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_F10
|
||||
|
||||
.Lzsum_kernel_L999:
|
||||
ret
|
||||
|
||||
.Lzsum_kernel_S_BEGIN:
|
||||
|
||||
/* convert INC_X from elements to bytes */
INIT_S
|
||||
|
||||
/* I = N / 4: number of passes of the 4x unrolled strided loop */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lzsum_kernel_S1
|
||||
|
||||
.Lzsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_S4
|
||||
|
||||
.Lzsum_kernel_S1:
|
||||
|
||||
/* I = N % 4: leftover elements */
ands I, N, #3
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
.Lzsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
|
|||
ZASUMKERNEL = asum.S
|
||||
XASUMKERNEL = asum.S
|
||||
|
||||
CSUMKERNEL = sum.S
|
||||
ZSUMKERNEL = sum.S
|
||||
XSUMKERNEL = sum.S
|
||||
|
||||
CNRM2KERNEL = nrm2.S
|
||||
ZNRM2KERNEL = nrm2.S
|
||||
XNRM2KERNEL = nrm2.S
|
||||
|
|
|
@ -0,0 +1,358 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2019, The OpenBLAS project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define PREFETCH_SIZE ( 8 * 16 + 4)
|
||||
#elif defined(DOUBLE)
|
||||
#define PREFETCH_SIZE (16 * 16 + 8)
|
||||
#else
|
||||
#define PREFETCH_SIZE (32 * 16 + 16)
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define COMPADD 0
|
||||
#define STRIDE INCX
|
||||
#else
|
||||
#define COMPADD 1
|
||||
#define STRIDE SIZE
|
||||
#endif
|
||||
|
||||
#define PRE1 r2
|
||||
|
||||
#define I r17
|
||||
#define J r18
|
||||
#define INCX16 r21
|
||||
|
||||
#define PR r30
|
||||
#define ARLC r31
|
||||
|
||||
#define N r32
|
||||
#define X r33
|
||||
#define INCX r34
|
||||
|
||||
|
||||
/*
 * Entry point. Sums the values of X into f8. f8..f11 serve as four
 * independent partial sums that are combined at .L998.
 * Without COMPLEX, COMPADD is 0 and STRIDE is INCX, so every load steps by
 * INCX. With COMPLEX, COMPADD is 1 and STRIDE is SIZE: loads alternate
 * between a step of SIZE and a step of INCX, and INCX is reduced by SIZE
 * below so that the two steps together span one element.
 * Returns early with f8 = f0 when INCX <= 0 or N <= 0.
 */
PROLOGUE
|
||||
.prologue
|
||||
PROFCODE
|
||||
{ .mfi
|
||||
/* PRE1: prefetch pointer, PREFETCH_SIZE * SIZE bytes beyond X */
adds PRE1 = PREFETCH_SIZE * SIZE, X
|
||||
/* f8 = f0 (f0 reads as 0.0 on IA-64) */
mov f8 = f0
|
||||
.save ar.lc, ARLC
|
||||
/* keep the caller's loop count register; restored at .L998 */
mov ARLC = ar.lc
|
||||
}
|
||||
;;
|
||||
.body
|
||||
#ifdef F_INTERFACE
|
||||
/* with F_INTERFACE, N and INCX arrive as addresses and are loaded here */
{ .mmi
|
||||
LDINT N = [N]
|
||||
LDINT INCX = [INCX]
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
#ifndef USE64BITINT
|
||||
/* sign-extend the 4-byte integers */
{ .mii
|
||||
nop.m 0
|
||||
sxt4 N = N
|
||||
sxt4 INCX = INCX
|
||||
}
|
||||
;;
|
||||
#endif
|
||||
#endif
|
||||
{ .mmi
|
||||
/* p6 = !(0 < INCX), p7 = !(0 < N): both lead to the early return below */
cmp.lt p0, p6 = r0, INCX
|
||||
cmp.lt p0, p7 = r0, N
|
||||
/* I = N >> (4 - COMPADD): passes of the main loop (16 loads per pass) */
shr I = N, (4 - COMPADD)
|
||||
}
|
||||
{ .mbb
|
||||
/* J = the low (4 - COMPADD) bits of N: elements left for the remainder code */
and J = ((1 << (4 - COMPADD)) - 1), N
|
||||
(p6) br.ret.sptk.many b0
|
||||
(p7) br.ret.sptk.many b0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
/* I - 1 is what gets written to ar.lc below */
adds I = -1, I
|
||||
mov f10 = f0
|
||||
/* keep the predicate registers; restored at .L998 */
mov PR = pr
|
||||
}
|
||||
{ .mfi
|
||||
/* p9 = (J == 0): no remainder to process */
cmp.eq p9, p0 = r0, J
|
||||
mov f9 = f0
|
||||
/* p12 = bit (3 - COMPADD) of N is set */
tbit.z p0, p12 = N, 3 - COMPADD
|
||||
}
|
||||
;;
|
||||
/* pipeline stage predicates: p16 true, p17..p19 false; epilogue count 3 */
{ .mmi
|
||||
cmp.eq p16, p0 = r0, r0
|
||||
cmp.ne p17, p0 = r0, r0
|
||||
mov ar.ec= 3
|
||||
}
|
||||
{ .mfi
|
||||
cmp.ne p18, p0 = r0, r0
|
||||
mov f11 = f0
|
||||
/* scale INCX by 2^(BASE_SHIFT + COMPADD) */
shl INCX = INCX, BASE_SHIFT + COMPADD
|
||||
}
|
||||
;;
|
||||
{ .mmi
|
||||
/* INCX16: step applied to the prefetch pointer PRE1 on each lfetch */
#ifdef XDOUBLE
|
||||
shladd INCX16 = INCX, (3 - COMPADD), r0
|
||||
#else
|
||||
shladd INCX16 = INCX, (4 - COMPADD), r0
|
||||
#endif
|
||||
cmp.ne p19, p0 = r0, r0
|
||||
mov ar.lc = I
|
||||
}
|
||||
{ .mmb
|
||||
/* p8 = (0 > I): no full pass, go straight to the remainder code */
cmp.gt p8 ,p0 = r0, I
|
||||
#ifdef COMPLEX
|
||||
/* the first load of each element already steps by SIZE (STRIDE) */
adds INCX = - SIZE, INCX
|
||||
#else
|
||||
nop.m 0
|
||||
#endif
|
||||
(p8) br.cond.dpnt .L55
|
||||
}
|
||||
;;
|
||||
.align 32
|
||||
|
||||
/*
 * Software-pipelined main loop (br.ctop with the stage predicates p16..p19).
 * Each pass issues 16 loads under p16 into rotating registers f32, f35, ...,
 * f77. The first twelve values are added under p18, where they are addressed
 * as f34, f37, ..., f67; the last four are added under p19 as f71, f74, f77
 * and f80. The adds are spread round-robin over f8, f9, f10 and f11.
 */
.L52:
|
||||
{ .mmf
|
||||
(p16) lfetch.nt1 [PRE1], INCX16
|
||||
(p16) LDFD f32 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f8 = f8, f71
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f35 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f9 = f9, f74
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f38 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f10 = f10, f77
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f41 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f11 = f11, f80
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f44 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f34
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f47 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f37
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f50 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f40
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f53 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f43
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
/* the XDOUBLE build issues a second prefetch per pass */
#ifdef XDOUBLE
|
||||
(p16) lfetch.nt1 [PRE1], INCX16
|
||||
#endif
|
||||
(p16) LDFD f56 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f46
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f59 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f49
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f62 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f52
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f65 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f55
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f68 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f58
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f71 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f61
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f74 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f64
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f77 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f67
|
||||
br.ctop.sptk.few .L52
|
||||
}
|
||||
;;
|
||||
/* NOTE(review): these unpredicated adds appear to pick up the last four
   values loaded by the final pass, whose p19-stage adds are not reached
   inside the loop with ar.ec = 3 - confirm against the br.ctop epilogue
   semantics. */
FADD f8 = f8, f71
|
||||
FADD f9 = f9, f74
|
||||
FADD f10 = f10, f77
|
||||
FADD f11 = f11, f80
|
||||
.align 32
|
||||
;;
|
||||
/*
 * Remainder code. p12, p13 and p14 (plus p15 in the non-COMPLEX build) are
 * taken from individual bits of N and guard groups of 8, 4, 2 and 1 loads
 * respectively. p9 (J == 0) skips the remainder altogether.
 */
.L55:
|
||||
(p12) LDFD f32 = [X], STRIDE
|
||||
(p9) br.cond.dptk .L998
|
||||
;;
|
||||
(p12) LDFD f33 = [X], INCX
|
||||
;;
|
||||
(p12) LDFD f34 = [X], STRIDE
|
||||
;;
|
||||
(p12) LDFD f35 = [X], INCX
|
||||
/* p13 = bit (2 - COMPADD) of N is set */
tbit.z p0, p13 = N, (2 - COMPADD)
|
||||
;;
|
||||
(p12) LDFD f36 = [X], STRIDE
|
||||
/* p14 = bit (1 - COMPADD) of N is set */
tbit.z p0, p14 = N, (1 - COMPADD)
|
||||
;;
|
||||
(p12) LDFD f37 = [X], INCX
|
||||
#ifndef COMPLEX
|
||||
/* p15 = bit 0 of N is set */
tbit.z p0, p15 = N, 0
|
||||
#endif
|
||||
;;
|
||||
(p12) LDFD f38 = [X], STRIDE
|
||||
;;
|
||||
(p12) LDFD f39 = [X], INCX
|
||||
;;
|
||||
(p13) LDFD f40 = [X], STRIDE
|
||||
;;
|
||||
(p13) LDFD f41 = [X], INCX
|
||||
;;
|
||||
/* from here on the adds of earlier loads are interleaved with later loads */
(p13) LDFD f42 = [X], STRIDE
|
||||
(p12) FADD f8 = f8, f32
|
||||
;;
|
||||
(p13) LDFD f43 = [X], INCX
|
||||
(p12) FADD f9 = f9, f33
|
||||
;;
|
||||
(p14) LDFD f44 = [X], STRIDE
|
||||
(p12) FADD f10 = f10, f34
|
||||
;;
|
||||
(p14) LDFD f45 = [X], INCX
|
||||
(p12) FADD f11 = f11, f35
|
||||
;;
|
||||
#ifndef COMPLEX
|
||||
(p15) LDFD f46 = [X]
|
||||
#endif
|
||||
(p12) FADD f8 = f8, f36
|
||||
;;
|
||||
(p12) FADD f9 = f9, f37
|
||||
(p12) FADD f10 = f10, f38
|
||||
(p12) FADD f11 = f11, f39
|
||||
;;
|
||||
(p13) FADD f8 = f8, f40
|
||||
(p13) FADD f9 = f9, f41
|
||||
/* the conditional block below is empty in this file */
#ifndef COMPLEX
|
||||
#endif
|
||||
(p13) FADD f10 = f10, f42
|
||||
;;
|
||||
(p13) FADD f11 = f11, f43
|
||||
(p14) FADD f8 = f8, f44
|
||||
(p14) FADD f9 = f9, f45
|
||||
#ifndef COMPLEX
|
||||
(p15) FADD f10 = f10, f46
|
||||
#endif
|
||||
;;
|
||||
.align 32
|
||||
|
||||
/* Exit: f8 = (f8 + f9) + (f10 + f11); restore ar.lc and the saved predicates. */
.L998:
|
||||
{ .mfi
|
||||
FADD f8 = f8, f9
|
||||
mov ar.lc = ARLC
|
||||
}
|
||||
{ .mmf
|
||||
FADD f10 = f10, f11
|
||||
}
|
||||
;;
|
||||
{ .mii
|
||||
/* restore the predicate registers selected by the mask from PR */
mov pr = PR, -65474
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
FADD f8 = f8, f10
|
||||
br.ret.sptk.many b0
|
||||
}
|
||||
EPILOGUE
|
|
@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c
|
|||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SSUMKERNEL = ../mips/sum.c
|
||||
DSUMKERNEL = ../mips/sum.c
|
||||
CSUMKERNEL = ../mips/zsum.c
|
||||
ZSUMKERNEL = ../mips/zsum.c
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SASUMKERNEL = ../mips/sasum_msa.c
|
||||
DASUMKERNEL = ../mips/dasum_msa.c
|
||||
|
|
|
@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
|
||||
/*
 * Plain (non-absolute) sum of a strided vector.
 *
 * n      number of elements to add up
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns x[0] + x[inc_x] + ... accumulated in index order, or 0.0 when
 * n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0)
		return total;

	/* one past the last index that will be visited */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return total;
}
|
||||
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CSUM1(x,i) x[i]+x[i+1]
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += CSUM1(x,i);
|
||||
i += inc_x2;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,332 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $4
|
||||
#define X $5
|
||||
#define INCX $6
|
||||
|
||||
#define I $2
|
||||
#define TEMP $3
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
|
||||
#define t1 $f10
|
||||
#define t2 $f11
|
||||
#define t3 $f12
|
||||
#define t4 $f13
|
||||
|
||||
#define s1 $f0
|
||||
#define s2 $f1
|
||||
|
||||
/*
 * Sum kernel: adds up N elements of X and leaves the total in s1 ($f0).
 * Each loaded value is copied with MOV (no absolute value is taken) and
 * added alternately into the two accumulators s1 and s2; the two are
 * combined at .L999.  A unit-stride path (.L12-.L16) and a general
 * stride path (.L20-.L26) are provided, both unrolled 8 elements deep
 * with a one-element-at-a-time tail for the remaining N & 7 elements.
 * NOTE(review): LD/ADD/MOV/MTC/LDINT, SIZE and BASE_SHIFT are macros from
 * common.h; presumably they select the precision-specific forms - confirm.
 * NOTE(review): the code appears to rely on MIPS branch delay slots, i.e.
 * the instruction written directly after a branch/jump is expected to
 * execute whether or not the branch is taken - confirm assembler mode.
 */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* clear both accumulators */
MTC $0, s1
|
||||
|
||||
MTC $0, s2
|
||||
/* shift INCX by BASE_SHIFT so it can be added to the X pointer directly */
dsll INCX, INCX, BASE_SHIFT
|
||||
|
||||
blez N, .L999
|
||||
li TEMP, SIZE
|
||||
|
||||
/* stride other than SIZE goes to .L20; I = N >> 3 sits in the slot after the branch */
bne INCX, TEMP, .L20
|
||||
dsra I, N, 3
|
||||
|
||||
blez I, .L15
|
||||
NOP
|
||||
|
||||
/* preload the first 8 elements; t1-t4 take copies of a1-a4 */
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
LD a3, 2 * SIZE(X)
|
||||
LD a4, 3 * SIZE(X)
|
||||
|
||||
LD a5, 4 * SIZE(X)
|
||||
MOV t1, a1
|
||||
LD a6, 5 * SIZE(X)
|
||||
MOV t2, a2
|
||||
LD a7, 6 * SIZE(X)
|
||||
MOV t3, a3
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
blez I, .L13
|
||||
LD a8, 7 * SIZE(X)
|
||||
.align 3
|
||||
|
||||
/* unit-stride main loop: adds 8 previously loaded values while loading the next 8 */
.L12:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 8 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddiu I, I, -1
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 9 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
NOP
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 10 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 11 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
daddiu X, X, 8 * SIZE
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 4 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 5 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
NOP
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 6 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a8, 7 * SIZE(X)
|
||||
|
||||
bgtz I, .L12
|
||||
MOV t4, a4
|
||||
.align 3
|
||||
|
||||
/* drain: add the values still held in t1-t4 and a5-a8, no further loads */
.L13:
|
||||
ADD s1, s1, t1
|
||||
daddiu X, X, 8 * SIZE
|
||||
|
||||
MOV t1, a5
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* unit-stride tail: the remaining N & 7 elements, one per pass, into s1 */
.L15:
|
||||
andi I, N, 7
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu I, I, -1
|
||||
|
||||
MOV t1, a1
|
||||
|
||||
ADD s1, s1, t1
|
||||
|
||||
bgtz I, .L16
|
||||
daddiu X, X, SIZE
|
||||
|
||||
j .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
/* general-stride path: same scheme, but X advances by INCX after every load */
.L20:
|
||||
blez I, .L25
|
||||
NOP
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a3, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a6, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t1, a1
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t3, a3
|
||||
LD a8, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
blez I, .L24
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a8, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
bgtz I, .L23
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
/* drain for the general-stride loop */
.L24:
|
||||
ADD s1, s1, t1
|
||||
MOV t1, a5
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* general-stride tail: the remaining N & 7 elements */
.L25:
|
||||
andi I, N, 7
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu I, I, -1
|
||||
|
||||
MOV t1, a1
|
||||
daddu X, X, INCX
|
||||
|
||||
bgtz I, .L26
|
||||
ADD s1, s1, t1
|
||||
.align 3
|
||||
|
||||
/* return through $31; the final s1 = s1 + s2 is written after the jump */
.L999:
|
||||
j $31
|
||||
ADD s1, s1, s2
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,204 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $4
|
||||
#define X $5
|
||||
#define INCX $6
|
||||
|
||||
#define I $2
|
||||
#define TEMP $3
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
|
||||
#define t1 $f10
|
||||
#define t2 $f11
|
||||
#define t3 $f12
|
||||
#define t4 $f13
|
||||
|
||||
#define s1 $f0
|
||||
#define s2 $f1
|
||||
|
||||
/*
 * Complex sum kernel: for each of N elements it loads the two values at
 * 0 * SIZE(X) and 1 * SIZE(X), adds the first into s1 and the second into
 * s2 (values are copied with MOV, no absolute value is taken), then steps
 * X by INCX.  s1 + s2 is left in s1 ($f0) at .L999.  The main loop
 * (.L23) handles 4 elements per pass; .L26 handles the remaining N & 3.
 * NOTE(review): LD/ADD/MOV/MTC/LDINT, SIZE and ZBASE_SHIFT are macros from
 * common.h; presumably they select the precision-specific forms - confirm.
 * NOTE(review): the code appears to rely on MIPS branch delay slots, i.e.
 * the instruction written directly after a branch/jump is expected to
 * execute whether or not the branch is taken - confirm assembler mode.
 */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* clear both accumulators */
MTC $0, s1
|
||||
|
||||
MTC $0, s2
|
||||
/* shift INCX by ZBASE_SHIFT so it can be added to the X pointer directly */
dsll INCX, INCX, ZBASE_SHIFT
|
||||
|
||||
/* I = N >> 2 is written in the slot after the branch */
blez N, .L999
|
||||
dsra I, N, 2
|
||||
|
||||
blez I, .L25
|
||||
NOP
|
||||
|
||||
/* preload the first 4 elements (8 values); t1-t4 take copies of a1-a4 */
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a3, 0 * SIZE(X)
|
||||
LD a4, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
LD a6, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t1, a1
|
||||
MOV t2, a2
|
||||
|
||||
LD a7, 0 * SIZE(X)
|
||||
LD a8, 1 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
blez I, .L24
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
/* main loop: adds 8 previously loaded values while loading the next 4 elements */
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddiu I, I, -1
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 1 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 1 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 1 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
LD a8, 1 * SIZE(X)
|
||||
|
||||
ADD s2, s2, t4
|
||||
daddu X, X, INCX
|
||||
|
||||
bgtz I, .L23
|
||||
MOV t4, a4
|
||||
.align 3
|
||||
|
||||
/* drain: add the values still held in t1-t4 and a5-a8, no further loads */
.L24:
|
||||
ADD s1, s1, t1
|
||||
MOV t1, a5
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* tail: the remaining N & 3 elements, one element (two values) per pass */
.L25:
|
||||
andi I, N, 3
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
daddiu I, I, -1
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
bgtz I, .L26
|
||||
ADD s2, s2, t2
|
||||
.align 3
|
||||
|
||||
/* return through $31; the final s1 = s1 + s2 is written after the jump */
.L999:
|
||||
j $31
|
||||
ADD s1, s1, s2
|
||||
|
||||
EPILOGUE
|
|
@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
|||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
|
@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
#ISAMAXKERNEL = ../arm/iamax.c
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
#ICAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
#ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
#ICAMINKERNEL = ../arm/izamin.c
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
|
@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c
|
|||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
#SAXPYKERNEL = ../arm/axpy.c
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
#CAXPYKERNEL = ../arm/zaxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
|
@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
|
|||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
#CDOTKERNEL = ../arm/zdot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
|
@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c
|
|||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = zrot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
|
@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c
|
|||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
#SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
#CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
#SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
#CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,184 @@
|
|||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = strmm_kernel_16x8_power8.S
|
||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power9.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#TODO: CGEMM3MKERNEL should use a 4x4 block size.
|
||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
#SAMAXKERNEL = ../arm/amax.c
|
||||
#DAMAXKERNEL = ../arm/amax.c
|
||||
#CAMAXKERNEL = ../arm/zamax.c
|
||||
#ZAMAXKERNEL = ../arm/zamax.c
|
||||
#
|
||||
#SAMINKERNEL = ../arm/amin.c
|
||||
#DAMINKERNEL = ../arm/amin.c
|
||||
#CAMINKERNEL = ../arm/zamin.c
|
||||
#ZAMINKERNEL = ../arm/zamin.c
|
||||
#
|
||||
#SMAXKERNEL = ../arm/max.c
|
||||
#DMAXKERNEL = ../arm/max.c
|
||||
#
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
#
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
|
@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,145 @@
|
|||
/*
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#ifndef HAVE_ASM_KERNEL
|
||||
#include <altivec.h>
|
||||
/*
 * Vector inner kernel for the contiguous single-precision complex AXPY.
 *
 * x and y are walked as arrays of __vector float (four floats, i.e. two
 * interleaved (re, im) pairs per vector).  Every y vector is updated in
 * place as
 *     y += x * scale_same + swap(x) * scale_swap
 * where swap() exchanges the two floats of each pair.  The sign pattern
 * stored in scale_same / scale_swap is chosen by CONJ / XCONJ.
 *
 * n                 number of complex elements; n / 2 vectors are processed
 *                   in groups of 8 (16 complex elements per pass), so the
 *                   caller hands in a multiple of 16
 * x                 source values, read only
 * y                 destination values, updated in place
 * alpha_r, alpha_i  real and imaginary part of the scaling factor
 *
 * NOTE(review): x and y are dereferenced through __vector float pointers;
 * presumably the target tolerates whatever alignment callers provide -
 * confirm.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    /* re += alpha_r*re(x) - alpha_i*im(x);  im += alpha_r*im(x) + alpha_i*re(x) */
    register __vector float scale_same = {alpha_r, alpha_r, alpha_r, alpha_r};
    register __vector float scale_swap = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
    /* re += alpha_r*re(x) + alpha_i*im(x);  im += -alpha_r*im(x) + alpha_i*re(x) */
    register __vector float scale_same = {alpha_r, -alpha_r, alpha_r, -alpha_r};
    register __vector float scale_swap = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif

    /* byte selector that swaps the two floats inside each (re, im) pair */
    __vector unsigned char pair_swap = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *dst = (__vector float *) y;
    register __vector float *src = (__vector float *) x;
    BLASLONG vectors_left;

    for (vectors_left = n / 2; vectors_left > 0; vectors_left -= 8, src += 8, dst += 8) {

        /* load 8 destination vectors, then 8 source vectors */
        register __vector float y0 = dst[0];
        register __vector float y1 = dst[1];
        register __vector float y2 = dst[2];
        register __vector float y3 = dst[3];
        register __vector float y4 = dst[4];
        register __vector float y5 = dst[5];
        register __vector float y6 = dst[6];
        register __vector float y7 = dst[7];
        register __vector float x0 = src[0];
        register __vector float x1 = src[1];
        register __vector float x2 = src[2];
        register __vector float x3 = src[3];
        register __vector float x4 = src[4];
        register __vector float x5 = src[5];
        register __vector float x6 = src[6];
        register __vector float x7 = src[7];

        /* contribution of x in its natural lane order */
        y0 += x0 * scale_same;
        y1 += x1 * scale_same;
        y2 += x2 * scale_same;
        y3 += x3 * scale_same;
        y4 += x4 * scale_same;
        y5 += x5 * scale_same;
        y6 += x6 * scale_same;
        y7 += x7 * scale_same;

        /* exchange re/im inside every pair */
        x0 = vec_perm(x0, x0, pair_swap);
        x1 = vec_perm(x1, x1, pair_swap);
        x2 = vec_perm(x2, x2, pair_swap);
        x3 = vec_perm(x3, x3, pair_swap);
        x4 = vec_perm(x4, x4, pair_swap);
        x5 = vec_perm(x5, x5, pair_swap);
        x6 = vec_perm(x6, x6, pair_swap);
        x7 = vec_perm(x7, x7, pair_swap);

        /* contribution of the swapped lanes */
        y0 += x0 * scale_swap;
        y1 += x1 * scale_swap;
        y2 += x2 * scale_swap;
        y3 += x3 * scale_swap;
        y4 += x4 * scale_swap;
        y5 += x5 * scale_swap;
        y6 += x6 * scale_swap;
        y7 += x7 * scale_swap;

        /* write the 8 updated vectors back */
        dst[0] = y0;
        dst[1] = y1;
        dst[2] = y2;
        dst[3] = y3;
        dst[4] = y4;
        dst[5] = y5;
        dst[6] = y6;
        dst[7] = y7;
    }
}
|
||||
#endif
|
||||
/*
 * Single-precision complex AXPY driver.
 *
 * Updates n complex elements of y (stored as interleaved re, im floats)
 * from the matching elements of x and the scalar (da_r, da_i):
 *   without CONJ:  re(y) += da_r*re(x) - da_i*im(x)
 *                  im(y) += da_r*im(x) + da_i*re(x)
 *   with CONJ:     re(y) += da_r*re(x) + da_i*im(x)
 *                  im(y) -= da_r*im(x) - da_i*re(x)
 *
 * n             number of complex elements; nothing is done when n <= 0
 * da_r, da_i    real / imaginary part of the scalar
 * x, inc_x      source vector and its stride in complex elements
 * y, inc_y      destination vector and its stride in complex elements
 * dummy0, dummy1, dummy, dummy2   not used by this function
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG k;
    BLASLONG xoff = 0, yoff = 0;
    BLASLONG xstep, ystep;

    if (n <= 0) return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
        /* contiguous data: the largest multiple of 16 goes through the vector kernel */
        BLASLONG bulk = n & -16;

        if (bulk) {
            caxpy_kernel_16(bulk, x, y, da_r, da_i);
        }

        /* scalar clean-up for the elements the kernel did not touch */
        xoff = 2 * bulk;
        for (k = bulk; k < n; k++) {
#if !defined(CONJ)
            y[xoff] += (da_r * x[xoff] - da_i * x[xoff + 1]);
            y[xoff + 1] += (da_r * x[xoff + 1] + da_i * x[xoff]);
#else
            y[xoff] += (da_r * x[xoff] + da_i * x[xoff + 1]);
            y[xoff + 1] -= (da_r * x[xoff + 1] - da_i * x[xoff]);
#endif
            xoff += 2;
        }
        return (0);
    }

    /* strided data: each complex element occupies two FLOATs */
    xstep = inc_x * 2;
    ystep = inc_y * 2;
    for (k = 0; k < n; k++) {
#if !defined(CONJ)
        y[yoff] += (da_r * x[xoff] - da_i * x[xoff + 1]);
        y[yoff + 1] += (da_r * x[xoff + 1] + da_i * x[xoff]);
#else
        y[yoff] += (da_r * x[xoff] + da_i * x[xoff + 1]);
        y[yoff + 1] -= (da_r * x[xoff + 1] - da_i * x[xoff]);
#endif
        xoff += xstep;
        yoff += ystep;
    }
    return (0);
}
|
||||
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
/*Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
/*
 * Vector inner kernel for the contiguous single-precision complex dot
 * product.
 *
 * x and y are walked as arrays of __vector float (two interleaved
 * (re, im) pairs per vector), four vectors - eight complex elements - per
 * pass.  Two families of partial sums are kept:
 *   same_k   accumulates x * y lane by lane        (re*re and im*im terms)
 *   cross_k  accumulates x * swap(y), where swap() exchanges the two
 *            floats of each pair                   (re*im and im*re terms)
 *
 * n      number of complex elements; the caller hands in a multiple of 8
 * x, y   input vectors, read only
 * dot    receives four floats:
 *          dot[0] = sum of re(x)*re(y)    dot[1] = sum of im(x)*im(y)
 *          dot[2] = sum of re(x)*im(y)    dot[3] = sum of im(x)*re(y)
 *        The previous contents of dot are overwritten, not accumulated.
 */
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
{
    /* byte selector that swaps the two floats inside each (re, im) pair */
    __vector unsigned char pair_swap = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *py = (__vector float *) y;
    register __vector float *px = (__vector float *) x;
    BLASLONG vectors_left;
    register __vector float same0 = { 0 };
    register __vector float same1 = { 0 };
    register __vector float same2 = { 0 };
    register __vector float same3 = { 0 };
    register __vector float cross0 = { 0 };
    register __vector float cross1 = { 0 };
    register __vector float cross2 = { 0 };
    register __vector float cross3 = { 0 };

    for (vectors_left = n / 2; vectors_left > 0; vectors_left -= 4, px += 4, py += 4) {

        register __vector float b0 = py[0];
        register __vector float b1 = py[1];
        register __vector float b2 = py[2];
        register __vector float b3 = py[3];
        register __vector float a0 = px[0];
        register __vector float a1 = px[1];
        register __vector float a2 = px[2];
        register __vector float a3 = px[3];

        /* y with re/im exchanged inside every pair */
        register __vector float b0_swapped = vec_perm(b0, b0, pair_swap);
        register __vector float b1_swapped = vec_perm(b1, b1, pair_swap);
        register __vector float b2_swapped = vec_perm(b2, b2, pair_swap);
        register __vector float b3_swapped = vec_perm(b3, b3, pair_swap);

        same0 += a0 * b0;
        same1 += a1 * b1;
        same2 += a2 * b2;
        same3 += a3 * b3;

        cross0 += a0 * b0_swapped;
        cross1 += a1 * b1_swapped;
        cross2 += a2 * b2_swapped;
        cross3 += a3 * b3_swapped;
    }

    /* collapse the four partial sums of each family into one vector */
    same0 = same0 + same1 + same2 + same3;
    cross0 = cross0 + cross1 + cross2 + cross3;

    /*
     * "Reverse and aggregate": add each vector to its vec_xxpermdi(v, v, 2)
     * permutation so that lanes 0 and 1, which are read below, hold the totals.
     */
    same1 = vec_xxpermdi(same0, same0, 2);
    cross1 = vec_xxpermdi(cross0, cross0, 2);
    same2 = same0 + same1;
    cross2 = cross0 + cross1;

    dot[0] = same2[0];
    dot[1] = same2[1];
    dot[2] = cross2[0];
    dot[3] = cross2[1];
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Single-precision complex dot product driver.
 *
 * Accumulates four real sums over the n complex elements of x and y
 * (stored as interleaved re, im floats):
 *   dot[0] = sum re(x)*re(y)    dot[1] = sum im(x)*im(y)
 *   dot[2] = sum re(x)*im(y)    dot[3] = sum im(x)*re(y)
 * and combines them as
 *   without CONJ:  result = (dot[0] - dot[1]) + i*(dot[2] + dot[3])
 *   with CONJ:     result = (dot[0] + dot[1]) + i*(dot[2] - dot[3])
 *
 * n          number of complex elements; the result is 0 + 0i when n <= 0
 * x, inc_x   first vector and its stride in complex elements
 * y, inc_y   second vector and its stride in complex elements
 *
 * Returns the dot product as an OPENBLAS_COMPLEX_FLOAT.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {

        /* contiguous data: the largest multiple of 8 goes through the vector kernel */
        BLASLONG n1 = n & -8;
        BLASLONG j = 0;

        if (n1) {
            /* the kernel overwrites dot[0..3] with its partial sums */
            cdot_kernel_8(n1, x, y, dot);
            i = n1;
            j = n1 * 2;
        }

        /* scalar clean-up for the elements the kernel did not touch */
        while (i < n) {
            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];

            j += 2;
            i++;
        }

    } else {

        /*
         * Strided data: each complex element occupies two FLOATs.  The
         * strides are doubled by multiplication rather than "<<= 1" because
         * a stride may be negative and left-shifting a negative signed
         * value is undefined behaviour in C.
         */
        inc_x *= 2;
        inc_y *= 2;

        while (i < n) {
            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];

            ix += inc_x;
            iy += inc_y;
            i++;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return (result);
}
|
|
@ -0,0 +1,585 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
#define NBMAX 1024
|
||||
|
||||
|
||||
/* vec_perm byte mask that swaps the two 4-byte words inside each 8-byte
   pair, i.e. exchanges the real and imaginary part of every complex float.
   The array is read through a __vector unsigned char pointer, so it must be
   16-byte aligned; a plain unsigned char array only guarantees 1. */
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
|
||||
/*
 * y[0..n) += A[:, 0..3] * x[0..3] for four consecutive columns of A.
 *
 * n    number of complex rows; the loop consumes four per iteration, so the
 *      callers in this file pass a multiple of 4.
 * lda  column stride of A in FLOATs.
 * ap   first of the four columns; x holds four complex values (8 FLOATs);
 *      y is the interleaved complex accumulator.
 *
 * Each x value is pre-splatted into a "real" and an "imag" vector whose sign
 * pattern encodes the CONJ/XCONJ variant, so a complex multiply becomes
 * a * x_r + swap(a) * x_i on whole vectors.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector float xr0 = {x[0], x[0], x[0], x[0]};
    const __vector float xi0 = {-x[1], x[1], -x[1], x[1]};
    const __vector float xr1 = {x[2], x[2], x[2], x[2]};
    const __vector float xi1 = {-x[3], x[3], -x[3], x[3]};
    const __vector float xr2 = {x[4], x[4], x[4], x[4]};
    const __vector float xi2 = {-x[5], x[5], -x[5], x[5]};
    const __vector float xr3 = {x[6], x[6], x[6], x[6]};
    const __vector float xi3 = {-x[7], x[7], -x[7], x[7]};
#else
    const __vector float xr0 = {x[0], -x[0], x[0], -x[0]};
    const __vector float xi0 = {x[1], x[1], x[1], x[1]};
    const __vector float xr1 = {x[2], -x[2], x[2], -x[2]};
    const __vector float xi1 = {x[3], x[3], x[3], x[3]};
    const __vector float xr2 = {x[4], -x[4], x[4], -x[4]};
    const __vector float xi2 = {x[5], x[5], x[5], x[5]};
    const __vector float xr3 = {x[6], -x[6], x[6], -x[6]};
    const __vector float xi3 = {x[7], x[7], x[7], x[7]};
#endif

    __vector float *yv   = (__vector float *) y;
    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *col2 = (__vector float *) (ap + 2 * lda);
    __vector float *col3 = (__vector float *) (ap + 3 * lda);

    /* One vector holds two complex floats, so n rows are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float y_lo = yv[k];
        __vector float y_hi = yv[k + 1];

        const __vector float a0_lo = col0[k];
        const __vector float a1_lo = col1[k];
        const __vector float a2_lo = col2[k];
        const __vector float a3_lo = col3[k];
        const __vector float a0_hi = col0[k + 1];
        const __vector float a1_hi = col1[k + 1];
        const __vector float a2_hi = col2[k + 1];
        const __vector float a3_hi = col3[k + 1];

        /* Contribution of the real parts of x. */
        y_lo += a0_lo*xr0 + a1_lo*xr1 + a2_lo*xr2 + a3_lo*xr3;
        y_hi += a0_hi*xr0 + a1_hi*xr1 + a2_hi*xr2 + a3_hi*xr3;

        /* Real/imag-swapped copies of A for the imaginary parts of x. */
        const __vector float s0_lo = vec_perm(a0_lo, a0_lo, swap_mask);
        const __vector float s0_hi = vec_perm(a0_hi, a0_hi, swap_mask);
        const __vector float s1_lo = vec_perm(a1_lo, a1_lo, swap_mask);
        const __vector float s1_hi = vec_perm(a1_hi, a1_hi, swap_mask);
        const __vector float s2_lo = vec_perm(a2_lo, a2_lo, swap_mask);
        const __vector float s2_hi = vec_perm(a2_hi, a2_hi, swap_mask);
        const __vector float s3_lo = vec_perm(a3_lo, a3_lo, swap_mask);
        const __vector float s3_hi = vec_perm(a3_hi, a3_hi, swap_mask);

        y_lo += s0_lo*xi0 + s1_lo*xi1 + s2_lo*xi2 + s3_lo*xi3;
        y_hi += s0_hi*xi0 + s1_hi*xi1 + s2_hi*xi2 + s3_hi*xi3;

        yv[k] = y_lo;
        yv[k + 1] = y_hi;
    }
}
|
||||
|
||||
|
||||
|
||||
/*
 * y[0..n) += A[:, 0..1] * x[0..1] for two consecutive columns of A.
 *
 * n    number of complex rows; four are consumed per loop iteration.
 * lda  column stride of A in FLOATs.
 * x    two complex values (4 FLOATs); y is the interleaved accumulator.
 *
 * The sign pattern of the splatted x vectors selects the CONJ/XCONJ variant.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector float xr0 = {x[0], x[0], x[0], x[0]};
    const __vector float xi0 = {-x[1], x[1], -x[1], x[1]};
    const __vector float xr1 = {x[2], x[2], x[2], x[2]};
    const __vector float xi1 = {-x[3], x[3], -x[3], x[3]};
#else
    const __vector float xr0 = {x[0], -x[0], x[0], -x[0]};
    const __vector float xi0 = {x[1], x[1], x[1], x[1]};
    const __vector float xr1 = {x[2], -x[2], x[2], -x[2]};
    const __vector float xi1 = {x[3], x[3], x[3], x[3]};
#endif

    __vector float *yv   = (__vector float *) y;
    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);

    /* One vector holds two complex floats, so n rows are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float y_lo = yv[k];
        __vector float y_hi = yv[k + 1];

        const __vector float a0_lo = col0[k];
        const __vector float a1_lo = col1[k];
        const __vector float a0_hi = col0[k + 1];
        const __vector float a1_hi = col1[k + 1];

        /* Real/imag-swapped copies pair with the imaginary part of x. */
        const __vector float s0_lo = vec_perm(a0_lo, a0_lo, swap_mask);
        const __vector float s0_hi = vec_perm(a0_hi, a0_hi, swap_mask);
        const __vector float s1_lo = vec_perm(a1_lo, a1_lo, swap_mask);
        const __vector float s1_hi = vec_perm(a1_hi, a1_hi, swap_mask);

        y_lo += a0_lo*xr0 + a1_lo*xr1 + s0_lo*xi0 + s1_lo*xi1;
        y_hi += a0_hi*xr0 + a1_hi*xr1 + s0_hi*xi0 + s1_hi*xi1;

        yv[k] = y_lo;
        yv[k + 1] = y_hi;
    }
}
|
||||
|
||||
|
||||
|
||||
/*
 * y[0..n) += A[:, 0] * x[0] for a single column of A.
 *
 * n   number of complex rows; four are consumed per loop iteration.
 * ap  the column; x holds one complex value; y is the interleaved accumulator.
 *
 * The sign pattern of the splatted x vectors selects the CONJ/XCONJ variant.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {

    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector float xr0 = {x[0], x[0], x[0], x[0]};
    const __vector float xi0 = {-x[1], x[1], -x[1], x[1]};
#else
    const __vector float xr0 = {x[0], -x[0], x[0], -x[0]};
    const __vector float xi0 = {x[1], x[1], x[1], x[1]};
#endif

    __vector float *yv   = (__vector float *) y;
    __vector float *col0 = (__vector float *) ap;

    /* One vector holds two complex floats, so n rows are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float y_lo = yv[k];
        __vector float y_hi = yv[k + 1];

        const __vector float a_lo = col0[k];
        const __vector float a_hi = col0[k + 1];
        const __vector float s_lo = vec_perm(a_lo, a_lo, swap_mask);
        const __vector float s_hi = vec_perm(a_hi, a_hi, swap_mask);

        y_lo += a_lo*xr0 + s_lo*xi0;
        y_hi += a_hi*xr0 + s_hi*xi0;

        yv[k] = y_lo;
        yv[k + 1] = y_hi;
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * dest += alpha * src (with src conjugated when XCONJ is defined).
 *
 * n         number of complex elements.
 * src       contiguous interleaved complex input.
 * dest      output vector; inc_dest is its stride in FLOATs (2 = contiguous).
 * alpha_r/i real and imaginary part of the scale factor.
 *
 * A contiguous destination is handled four complex values at a time with
 * vector arithmetic; any other stride falls back to a scalar loop.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG k;

    if (inc_dest == 2) {
        const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

#if !defined(XCONJ)
        const __vector float va_r = {alpha_r, alpha_r, alpha_r, alpha_r};
        const __vector float va_i = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
        const __vector float va_r = {alpha_r, -alpha_r, alpha_r, -alpha_r};
        const __vector float va_i = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif

        __vector float *sv = (__vector float *) src;
        __vector float *dv = (__vector float *) dest;
        /* One vector holds two complex floats. */
        const BLASLONG nvec = n / 2;

        for (k = 0; k < nvec; k += 2) {
            __vector float d_lo = dv[k];
            __vector float d_hi = dv[k + 1];

            const __vector float s_lo = sv[k];
            const __vector float s_hi = sv[k + 1];
            /* Real/imag-swapped copies pair with alpha_i. */
            const __vector float x_lo = vec_perm(s_lo, s_lo, swap_mask);
            const __vector float x_hi = vec_perm(s_hi, s_hi, swap_mask);

            d_lo += s_lo*va_r + x_lo*va_i;
            d_hi += s_hi*va_r + x_hi*va_i;

            dv[k] = d_lo;
            dv[k + 1] = d_hi;
        }
        return;
    }

    /* Strided destination: one complex element per iteration. */
    for (k = 0; k < n; k++) {
        const FLOAT s_r = src[2 * k];
        const FLOAT s_i = src[2 * k + 1];
#if !defined(XCONJ)
        const FLOAT t_r = alpha_r * s_r - alpha_i * s_i;
        const FLOAT t_i = alpha_r * s_i + alpha_i * s_r;
#else
        const FLOAT t_r = alpha_r * s_r + alpha_i * s_i;
        const FLOAT t_i = -alpha_r * s_i + alpha_i * s_r;
#endif

        dest[0] += t_r;
        dest[1] += t_i;
        dest += inc_dest;
    }
}
|
||||
|
||||
|
||||
|
||||
/*
 * Scalar handling of the last `rows` (1..3) rows of A, which the 4-row
 * vector kernels cannot process.
 *
 * For every row r < rows this accumulates sum_j A[r][j] * x[j] (sign pattern
 * selected by CONJ/XCONJ) and then adds alpha times that sum to y[r].
 * All strides (lda, inc_x, inc_y) are in FLOATs.
 */
static void cgemv_n_tail_rows(BLASLONG rows, BLASLONG n, FLOAT alpha_r, FLOAT alpha_i,
                              FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x,
                              FLOAT *y, BLASLONG inc_y) {
    FLOAT acc_r[3] = {0.0, 0.0, 0.0};
    FLOAT acc_i[3] = {0.0, 0.0, 0.0};
    BLASLONG col, r;

    for (col = 0; col < n; col++) {
        for (r = 0; r < rows; r++) {
            const FLOAT a_r = a[2 * r];
            const FLOAT a_i = a[2 * r + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            acc_r[r] += a_r * x[0] - a_i * x[1];
            acc_i[r] += a_r * x[1] + a_i * x[0];
#else
            acc_r[r] += a_r * x[0] + a_i * x[1];
            acc_i[r] += a_r * x[1] - a_i * x[0];
#endif
        }
        a += lda;
        x += inc_x;
    }

    for (r = 0; r < rows; r++) {
        FLOAT *y_r = y + r * inc_y;
#if !defined(XCONJ)
        y_r[0] += alpha_r * acc_r[r] - alpha_i * acc_i[r];
        y_r[1] += alpha_r * acc_i[r] + alpha_i * acc_r[r];
#else
        y_r[0] += alpha_r * acc_r[r] + alpha_i * acc_i[r];
        y_r[1] -= alpha_r * acc_i[r] - alpha_i * acc_r[r];
#endif
    }
}

/*
 * Complex single-precision GEMV, non-transposed: y += alpha * A * x
 * (conjugation variants selected by CONJ/XCONJ).
 *
 * m, n      rows and columns of A; nothing is done if either is < 1.
 * dummy1    unused.
 * alpha_r/i real and imaginary part of alpha.
 * a, lda    matrix and its column stride in complex elements.
 * x, inc_x  input vector and stride in complex elements.
 * y, inc_y  output vector and stride in complex elements.
 * buffer    scratch space used as the per-block accumulator (at least
 *           2 * NBMAX FLOATs are written).
 *
 * Rows are processed in blocks of up to NBMAX (always a multiple of 4) with
 * the vector kernels; the remaining m % 4 rows are handled by a scalar helper.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    BLASLONG i;
    BLASLONG k;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;

    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    /* From here on all strides are counted in FLOATs. */
    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    n1 = n / 4;   /* groups of four columns   */
    n2 = n % 4;   /* left-over columns        */

    m3 = m % 4;                     /* rows left for the scalar tail    */
    m1 = m - (m % 4);               /* rows covered by the 4-row kernels */
    m2 = (m % NBMAX) - (m % 4);     /* size of the final, partial block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        memset(ybuffer, 0, NB * 2*sizeof(FLOAT));

        if (inc_x == 2) {
            /* Contiguous x: the kernels read it in place. */
            for (i = 0; i < n1; i++) {
                cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);

                a_ptr += lda << 2;
                x_ptr += 8;
            }

            if (n2 & 2) {
                cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }

            if (n2 & 1) {
                cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;
            }
        } else {
            /* Strided x: gather the needed values into xbuffer first. */
            for (i = 0; i < n1; i++) {
                for (k = 0; k < 4; k++) {
                    xbuffer[2 * k] = x_ptr[0];
                    xbuffer[2 * k + 1] = x_ptr[1];
                    x_ptr += inc_x;
                }

                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);

                a_ptr += lda << 2;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;
            }
        }

        /* y block += alpha * accumulated block. */
        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);

    /* Remaining 1..3 rows; a and y_ptr now point just past the blocked part. */
    cgemv_n_tail_rows(m3, n, alpha_r, alpha_i, a, lda, x, inc_x, y_ptr, inc_y);

    return (0);
}
|
||||
|
|
@ -0,0 +1,571 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 1024
|
||||
#include <altivec.h>
|
||||
/* vec_perm byte mask that swaps the two 4-byte words inside each 8-byte
   pair, i.e. exchanges the real and imaginary part of every complex float.
   The array is read through a __vector unsigned char pointer, so it must be
   16-byte aligned; a plain unsigned char array only guarantees 1. */
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
/*
 * Transposed kernel for four columns: for c = 0..3,
 *   y[c] += alpha * sum_k A[k][c] * x[k]
 * with the conjugation variant selected by CONJ/XCONJ.
 *
 * n    number of complex elements in x and in each column; four are consumed
 *      per loop iteration (NOTE(review): callers presumably pass a multiple
 *      of 4 -- confirm against the driver).
 * lda  column stride of A in FLOATs.
 * y    four interleaved complex outputs (8 FLOATs).
 *
 * Two accumulators are kept per column: "dir" holds lane-wise x*a
 * (real*real, imag*imag, ...) and "swp" holds swap(x)*a
 * (imag*real, real*imag, ...). They are reduced horizontally after the loop.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *col2 = (__vector float *) (ap + 2 * lda);
    __vector float *col3 = (__vector float *) (ap + 3 * lda);
    __vector float *xv   = (__vector float *) x;

    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp1 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir2 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp2 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir3 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp3 = {0.0, 0.0, 0.0, 0.0};

    /* One vector holds two complex floats, so n elements are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;
    int c;

    for (k = 0; k < nvec; k += 2) {
        const __vector float x_lo  = xv[k];
        const __vector float x_hi  = xv[k + 1];
        const __vector float xs_lo = vec_perm(x_lo, x_lo, swap_mask);
        const __vector float xs_hi = vec_perm(x_hi, x_hi, swap_mask);

        dir0 += x_lo*col0[k] + x_hi*col0[k + 1];
        swp0 += xs_lo*col0[k] + xs_hi*col0[k + 1];
        dir1 += x_lo*col1[k] + x_hi*col1[k + 1];
        swp1 += xs_lo*col1[k] + xs_hi*col1[k + 1];
        dir2 += x_lo*col2[k] + x_hi*col2[k + 1];
        swp2 += xs_lo*col2[k] + xs_hi*col2[k + 1];
        dir3 += x_lo*col3[k] + x_hi*col3[k + 1];
        swp3 += xs_lo*col3[k] + xs_hi*col3[k + 1];
    }

    /* Gather the accumulators so the reduction can be written once. */
    const __vector float dir[4] = {dir0, dir1, dir2, dir3};
    const __vector float swp[4] = {swp0, swp1, swp2, swp3};

    for (c = 0; c < 4; c++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        const FLOAT sum_r = dir[c][0] - dir[c][1] + dir[c][2] - dir[c][3];
        const FLOAT sum_i = swp[c][0] + swp[c][1] + swp[c][2] + swp[c][3];
#else
        const FLOAT sum_r = dir[c][0] + dir[c][1] + dir[c][2] + dir[c][3];
        const FLOAT sum_i = swp[c][0] - swp[c][1] + swp[c][2] - swp[c][3];
#endif

#if !defined(XCONJ)
        y[2 * c]     += alpha_r * sum_r - alpha_i * sum_i;
        y[2 * c + 1] += alpha_r * sum_i + alpha_i * sum_r;
#else
        y[2 * c]     += alpha_r * sum_r + alpha_i * sum_i;
        y[2 * c + 1] -= alpha_r * sum_i - alpha_i * sum_r;
#endif
    }
}
|
||||
|
||||
|
||||
/*
 * Transposed kernel for two columns: for c = 0..1,
 *   y[c] += alpha * sum_k A[k][c] * x[k]
 * with the conjugation variant selected by CONJ/XCONJ.
 *
 * n    number of complex elements in x and in each column; four are consumed
 *      per loop iteration.
 * lda  column stride of A in FLOATs.
 * y    two interleaved complex outputs (4 FLOATs).
 *
 * "dir" accumulates lane-wise x*a, "swp" accumulates swap(x)*a; both are
 * reduced horizontally after the loop.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *xv   = (__vector float *) x;

    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp1 = {0.0, 0.0, 0.0, 0.0};

    /* One vector holds two complex floats, so n elements are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;
    int c;

    for (k = 0; k < nvec; k += 2) {
        const __vector float x_lo  = xv[k];
        const __vector float x_hi  = xv[k + 1];
        const __vector float xs_lo = vec_perm(x_lo, x_lo, swap_mask);
        const __vector float xs_hi = vec_perm(x_hi, x_hi, swap_mask);

        dir0 += x_lo*col0[k] + x_hi*col0[k + 1];
        swp0 += xs_lo*col0[k] + xs_hi*col0[k + 1];
        dir1 += x_lo*col1[k] + x_hi*col1[k + 1];
        swp1 += xs_lo*col1[k] + xs_hi*col1[k + 1];
    }

    /* Gather the accumulators so the reduction can be written once. */
    const __vector float dir[2] = {dir0, dir1};
    const __vector float swp[2] = {swp0, swp1};

    for (c = 0; c < 2; c++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        const FLOAT sum_r = dir[c][0] - dir[c][1] + dir[c][2] - dir[c][3];
        const FLOAT sum_i = swp[c][0] + swp[c][1] + swp[c][2] + swp[c][3];
#else
        const FLOAT sum_r = dir[c][0] + dir[c][1] + dir[c][2] + dir[c][3];
        const FLOAT sum_i = swp[c][0] - swp[c][1] + swp[c][2] - swp[c][3];
#endif

#if !defined(XCONJ)
        y[2 * c]     += alpha_r * sum_r - alpha_i * sum_i;
        y[2 * c + 1] += alpha_r * sum_i + alpha_i * sum_r;
#else
        y[2 * c]     += alpha_r * sum_r + alpha_i * sum_i;
        y[2 * c + 1] -= alpha_r * sum_i - alpha_i * sum_r;
#endif
    }
}
|
||||
|
||||
|
||||
/*
 * Transposed kernel for a single column:
 *   y[0] += alpha * sum_k A[k] * x[k]
 * with the conjugation variant selected by CONJ/XCONJ.
 *
 * n   number of complex elements in x and in the column; four are consumed
 *     per loop iteration.
 * y   one interleaved complex output (2 FLOATs).
 *
 * "dir" accumulates lane-wise x*a, "swp" accumulates swap(x)*a; both are
 * reduced horizontally after the loop.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *xv   = (__vector float *) x;

    __vector float dir = {0.0, 0.0, 0.0, 0.0};
    __vector float swp = {0.0, 0.0, 0.0, 0.0};

    /* One vector holds two complex floats, so n elements are n / 2 vectors. */
    const BLASLONG nvec = n / 2;
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        const __vector float x_lo  = xv[k];
        const __vector float x_hi  = xv[k + 1];
        const __vector float xs_lo = vec_perm(x_lo, x_lo, swap_mask);
        const __vector float xs_hi = vec_perm(x_hi, x_hi, swap_mask);

        dir += x_lo*col0[k] + x_hi*col0[k + 1];
        swp += xs_lo*col0[k] + xs_hi*col0[k + 1];
    }

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const FLOAT sum_r = dir[0] - dir[1] + dir[2] - dir[3];
    const FLOAT sum_i = swp[0] + swp[1] + swp[2] + swp[3];
#else
    const FLOAT sum_r = dir[0] + dir[1] + dir[2] + dir[3];
    const FLOAT sum_i = swp[0] - swp[1] + swp[2] - swp[3];
#endif

#if !defined(XCONJ)
    y[0] += alpha_r * sum_r - alpha_i * sum_i;
    y[1] += alpha_r * sum_i + alpha_i * sum_r;
#else
    y[0] += alpha_r * sum_r + alpha_i * sum_i;
    y[1] -= alpha_r * sum_i - alpha_i * sum_r;
#endif
}
|
||||
|
||||
/*
 * Gather n complex values from a strided source into a contiguous buffer.
 *
 * src     - first complex element (real part, then imaginary part)
 * dest    - output buffer, receives 2*n consecutive FLOATs
 * inc_src - distance between source elements, counted in FLOATs
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG remaining;

    for (remaining = n; remaining > 0; remaining--) {
        dest[0] = src[0];   /* real part */
        dest[1] = src[1];   /* imaginary part */
        dest += 2;
        src += inc_src;
    }
}
|
||||
|
||||
/*
 * Driver for the transposed complex GEMV: for every column j of A it adds
 * alpha * (column j of A combined with x) to y[j]; the CONJ / XCONJ macros
 * select the conjugation variant.
 *
 * m, n         - rows / columns of A
 * dummy1       - unused
 * alpha_r/_i   - real and imaginary part of alpha
 * a, lda       - matrix and its leading dimension (in complex elements)
 * x, inc_x     - input vector and its stride (in complex elements)
 * y, inc_y     - output vector and its stride (in complex elements)
 * buffer       - scratch space used to pack x when inc_x is not 1
 *
 * Rows are processed in blocks of at most NBMAX rows through the
 * cgemv_kernel_4x{4,2,1} micro-kernels; the last (m & 3) rows are handled by
 * scalar code at the end. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG col;
    BLASLONG k;
    FLOAT *acol;
    FLOAT *xin;
    FLOAT *yout;
    FLOAT ytmp[8];
    FLOAT *xpacked;

    if (m < 1 || n < 1) return 0;

    /* From here on all strides are counted in FLOATs (two per complex value). */
    inc_x <<= 1;
    inc_y <<= 1;
    lda <<= 1;

    xpacked = buffer;

    const BLASLONG col_quads = n >> 2;                      /* groups of 4 columns   */
    const BLASLONG col_rest = n & 3;                        /* leftover columns      */
    const BLASLONG row_rest = m & 3;                        /* rows for scalar tail  */
    BLASLONG rows_left = m - row_rest;                      /* rows for the kernels  */
    const BLASLONG last_blk = (m & (NBMAX - 1)) - row_rest; /* size of partial block */

    BLASLONG blk = NBMAX;

    /* A block smaller than NBMAX is by construction the last one. */
    while (blk == NBMAX) {

        rows_left -= blk;
        if (rows_left < 0) {
            if (last_blk == 0) break;
            blk = last_blk;
        }

        yout = y;
        acol = a;
        xin = x;

        /* The kernels need contiguous x: use it in place or pack it. */
        if (inc_x == 2)
            xpacked = xin;
        else
            copy_x(blk, xin, xpacked, inc_x);

        if (inc_y == 2) {

            /* Contiguous y: the kernels accumulate straight into y. */
            for (col = 0; col < col_quads; col++) {
                cgemv_kernel_4x4(blk, lda, acol, xpacked, yout, alpha_r, alpha_i);
                acol += lda << 2;
                yout += 8;
            }

            if (col_rest & 2) {
                cgemv_kernel_4x2(blk, lda, acol, xpacked, yout, alpha_r, alpha_i);
                acol += lda << 1;
                yout += 4;
            }

            if (col_rest & 1) {
                cgemv_kernel_4x1(blk, acol, xpacked, yout, alpha_r, alpha_i);
                acol += lda;
                yout += 2;
            }

        } else {

            /* Strided y: accumulate into a zeroed scratch array, then scatter. */
            for (col = 0; col < col_quads; col++) {
                memset(ytmp, 0, sizeof (ytmp));
                cgemv_kernel_4x4(blk, lda, acol, xpacked, ytmp, alpha_r, alpha_i);
                acol += lda << 2;

                for (k = 0; k < 8; k += 2) {
                    yout[0] += ytmp[k];
                    yout[1] += ytmp[k + 1];
                    yout += inc_y;
                }
            }

            for (col = 0; col < col_rest; col++) {
                memset(ytmp, 0, sizeof (ytmp));
                cgemv_kernel_4x1(blk, acol, xpacked, ytmp, alpha_r, alpha_i);
                acol += lda;
                yout[0] += ytmp[0];
                yout[1] += ytmp[1];
                yout += inc_y;
            }

        }

        /* Advance to the next block of rows. */
        a += 2 * blk;
        x += blk * inc_x;
    }

    if (row_rest == 0) return 0;

    /* Scalar handling of the remaining 1..3 rows. */
    xin = x;
    acol = a;
    yout = y;

    if (row_rest == 3) {

        FLOAT sum_re;
        FLOAT sum_im;
        FLOAT xr0 = xin[0];
        FLOAT xi0 = xin[1];
        xin += inc_x;
        FLOAT xr1 = xin[0];
        FLOAT xi1 = xin[1];
        xin += inc_x;
        FLOAT xr2 = xin[0];
        FLOAT xi2 = xin[1];

        for (col = 0; col < n; col++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re = acol[0] * xr0 - acol[1] * xi0;
            sum_im = acol[0] * xi0 + acol[1] * xr0;
            sum_re += acol[2] * xr1 - acol[3] * xi1;
            sum_im += acol[2] * xi1 + acol[3] * xr1;
            sum_re += acol[4] * xr2 - acol[5] * xi2;
            sum_im += acol[4] * xi2 + acol[5] * xr2;
#else
            sum_re = acol[0] * xr0 + acol[1] * xi0;
            sum_im = acol[0] * xi0 - acol[1] * xr0;
            sum_re += acol[2] * xr1 + acol[3] * xi1;
            sum_im += acol[2] * xi1 - acol[3] * xr1;
            sum_re += acol[4] * xr2 + acol[5] * xi2;
            sum_im += acol[4] * xi2 - acol[5] * xr2;
#endif

#if !defined(XCONJ)
            yout[0] += alpha_r * sum_re - alpha_i * sum_im;
            yout[1] += alpha_r * sum_im + alpha_i * sum_re;
#else
            yout[0] += alpha_r * sum_re + alpha_i * sum_im;
            yout[1] -= alpha_r * sum_im - alpha_i * sum_re;
#endif

            acol += lda;
            yout += inc_y;
        }
        return 0;
    }

    if (row_rest == 2) {

        FLOAT sum_re;
        FLOAT sum_im;
        FLOAT sum_re_b;
        FLOAT sum_im_b;
        FLOAT xr0 = xin[0];
        FLOAT xi0 = xin[1];
        xin += inc_x;
        FLOAT xr1 = xin[0];
        FLOAT xi1 = xin[1];

        const BLASLONG n_even = n & -2;

        /* Two columns per iteration. */
        for (col = 0; col < n_even; col += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re = acol[0] * xr0 - acol[1] * xi0;
            sum_im = acol[0] * xi0 + acol[1] * xr0;
            sum_re += acol[2] * xr1 - acol[3] * xi1;
            sum_im += acol[2] * xi1 + acol[3] * xr1;
            acol += lda;
            sum_re_b = acol[0] * xr0 - acol[1] * xi0;
            sum_im_b = acol[0] * xi0 + acol[1] * xr0;
            sum_re_b += acol[2] * xr1 - acol[3] * xi1;
            sum_im_b += acol[2] * xi1 + acol[3] * xr1;
#else
            sum_re = acol[0] * xr0 + acol[1] * xi0;
            sum_im = acol[0] * xi0 - acol[1] * xr0;
            sum_re += acol[2] * xr1 + acol[3] * xi1;
            sum_im += acol[2] * xi1 - acol[3] * xr1;
            acol += lda;
            sum_re_b = acol[0] * xr0 + acol[1] * xi0;
            sum_im_b = acol[0] * xi0 - acol[1] * xr0;
            sum_re_b += acol[2] * xr1 + acol[3] * xi1;
            sum_im_b += acol[2] * xi1 - acol[3] * xr1;
#endif

#if !defined(XCONJ)
            yout[0] += alpha_r * sum_re - alpha_i * sum_im;
            yout[1] += alpha_r * sum_im + alpha_i * sum_re;
            yout += inc_y;
            yout[0] += alpha_r * sum_re_b - alpha_i * sum_im_b;
            yout[1] += alpha_r * sum_im_b + alpha_i * sum_re_b;
#else
            yout[0] += alpha_r * sum_re + alpha_i * sum_im;
            yout[1] -= alpha_r * sum_im - alpha_i * sum_re;
            yout += inc_y;
            yout[0] += alpha_r * sum_re_b + alpha_i * sum_im_b;
            yout[1] -= alpha_r * sum_im_b - alpha_i * sum_re_b;
#endif

            acol += lda;
            yout += inc_y;
        }

        /* Odd trailing column, if any. */
        for (; col < n; col++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re = acol[0] * xr0 - acol[1] * xi0;
            sum_im = acol[0] * xi0 + acol[1] * xr0;
            sum_re += acol[2] * xr1 - acol[3] * xi1;
            sum_im += acol[2] * xi1 + acol[3] * xr1;
#else
            sum_re = acol[0] * xr0 + acol[1] * xi0;
            sum_im = acol[0] * xi0 - acol[1] * xr0;
            sum_re += acol[2] * xr1 + acol[3] * xi1;
            sum_im += acol[2] * xi1 - acol[3] * xr1;
#endif

#if !defined(XCONJ)
            yout[0] += alpha_r * sum_re - alpha_i * sum_im;
            yout[1] += alpha_r * sum_im + alpha_i * sum_re;
#else
            yout[0] += alpha_r * sum_re + alpha_i * sum_im;
            yout[1] -= alpha_r * sum_im - alpha_i * sum_re;
#endif

            acol += lda;
            yout += inc_y;
        }

        return 0;
    }

    if (row_rest == 1) {

        FLOAT sum_re;
        FLOAT sum_im;
        FLOAT sum_re_b;
        FLOAT sum_im_b;
        FLOAT xr0 = xin[0];
        FLOAT xi0 = xin[1];

        const BLASLONG n_even = n & -2;

        /* Two columns per iteration. */
        for (col = 0; col < n_even; col += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re = acol[0] * xr0 - acol[1] * xi0;
            sum_im = acol[0] * xi0 + acol[1] * xr0;
            acol += lda;
            sum_re_b = acol[0] * xr0 - acol[1] * xi0;
            sum_im_b = acol[0] * xi0 + acol[1] * xr0;
#else
            sum_re = acol[0] * xr0 + acol[1] * xi0;
            sum_im = acol[0] * xi0 - acol[1] * xr0;
            acol += lda;
            sum_re_b = acol[0] * xr0 + acol[1] * xi0;
            sum_im_b = acol[0] * xi0 - acol[1] * xr0;
#endif

#if !defined(XCONJ)
            yout[0] += alpha_r * sum_re - alpha_i * sum_im;
            yout[1] += alpha_r * sum_im + alpha_i * sum_re;
            yout += inc_y;
            yout[0] += alpha_r * sum_re_b - alpha_i * sum_im_b;
            yout[1] += alpha_r * sum_im_b + alpha_i * sum_re_b;
#else
            yout[0] += alpha_r * sum_re + alpha_i * sum_im;
            yout[1] -= alpha_r * sum_im - alpha_i * sum_re;
            yout += inc_y;
            yout[0] += alpha_r * sum_re_b + alpha_i * sum_im_b;
            yout[1] -= alpha_r * sum_im_b - alpha_i * sum_re_b;
#endif

            acol += lda;
            yout += inc_y;
        }

        /* Odd trailing column, if any. */
        for (; col < n; col++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re = acol[0] * xr0 - acol[1] * xi0;
            sum_im = acol[0] * xi0 + acol[1] * xr0;
#else
            sum_re = acol[0] * xr0 + acol[1] * xi0;
            sum_im = acol[0] * xi0 - acol[1] * xr0;
#endif

#if !defined(XCONJ)
            yout[0] += alpha_r * sum_re - alpha_i * sum_im;
            yout[1] += alpha_r * sum_im + alpha_i * sum_re;
#else
            yout[0] += alpha_r * sum_re + alpha_i * sum_im;
            yout[1] -= alpha_r * sum_im - alpha_i * sum_re;
#endif

            acol += lda;
            yout += inc_y;
        }
        return 0;
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
|
||||
/*
 * Plane-rotation kernel written in VSX inline assembly.
 *
 * For every float in x and y it stores x' = c*x + s*y and y' = c*y - s*x
 * (real and imaginary parts are treated alike because c and s are real).
 *
 * n    - element count; it is decremented by 8 per 64-byte chunk of x and y,
 *        and the first chunk is loaded unconditionally, so the caller must
 *        pass a positive multiple of 8.
 * x, y - the two vectors, updated in place (declared as 2*n floats each in
 *        the memory operands).
 * c, s - rotation coefficients, splatted into vs36 and vs37.
 *
 * The loop is software pipelined: the chunk for the next iteration is loaded
 * while the current one is being computed, which is why the pointers are
 * moved back by 64 bytes before each store and forward by 128 afterwards.
 */
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||
{
|
||||
/* Scratch vectors bound to the asm outputs x0..x7 (c*y and s*y products). */
__vector float t0;
|
||||
__vector float t1;
|
||||
__vector float t2;
|
||||
__vector float t3;
|
||||
__vector float t4;
|
||||
__vector float t5;
|
||||
__vector float t6;
|
||||
__vector float t7;
|
||||
__asm__
|
||||
(
|
||||
"xscvdpspn 36, %x[cos] \n\t" // load c to all words
|
||||
"xxspltw 36, 36, 0 \n\t"
|
||||
"xscvdpspn 37, %x[sin] \n\t" // load s to all words
|
||||
"xxspltw 37, 37, 0 \n\t"
|
||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
||||
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
|
||||
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
|
||||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
|
||||
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
|
||||
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
|
||||
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
|
||||
"addi %[x_ptr], %[x_ptr], 64 \n\t"
|
||||
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||
"ble 2f \n\t"
|
||||
".p2align 5 \n\t"
|
||||
// Main loop: compute the chunk loaded earlier while loading the next one.
"1: \n\t"
|
||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||
"xvmulsp 41, 33, 36 \n\t"
|
||||
"xvmulsp 42, 34, 36 \n\t"
|
||||
"xvmulsp 43, 35, 36 \n\t"
|
||||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
|
||||
"xvmulsp %x[x2], 49, 36 \n\t"
|
||||
"xvmulsp %x[x1], 50, 36 \n\t"
|
||||
"xvmulsp %x[x3], 51, 36 \n\t"
|
||||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
|
||||
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
|
||||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
|
||||
"xvmulsp %x[x5], 49, 37 \n\t"
|
||||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
|
||||
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
|
||||
"xvmulsp %x[x6], 50, 37 \n\t"
|
||||
"xvmulsp %x[x7], 51, 37 \n\t"
|
||||
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
|
||||
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
|
||||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
|
||||
"addi %[x_ptr], %[x_ptr], -64 \n\t"
|
||||
"addi %[y_ptr], %[y_ptr], -64 \n\t"
|
||||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
|
||||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
|
||||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
|
||||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
|
||||
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
|
||||
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
|
||||
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
|
||||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
|
||||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
|
||||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
|
||||
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
|
||||
"addi %[x_ptr], %[x_ptr], 128 \n\t"
|
||||
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
// Drain: process the last chunk that is already in registers (no further loads).
"2: \n\t"
|
||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||
"xvmulsp 41, 33, 36 \n\t"
|
||||
"xvmulsp 42, 34, 36 \n\t"
|
||||
"xvmulsp 43, 35, 36 \n\t"
|
||||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
|
||||
"xvmulsp %x[x2], 49, 36 \n\t"
|
||||
"xvmulsp %x[x1], 50, 36 \n\t"
|
||||
"xvmulsp %x[x3], 51, 36 \n\t"
|
||||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
|
||||
"xvmulsp %x[x5], 49, 37 \n\t"
|
||||
"xvmulsp %x[x6], 50, 37 \n\t"
|
||||
"xvmulsp %x[x7], 51, 37 \n\t"
|
||||
"addi %[x_ptr], %[x_ptr], -64 \n\t"
|
||||
"addi %[y_ptr], %[y_ptr], -64 \n\t"
|
||||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
|
||||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
|
||||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
|
||||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
|
||||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
|
||||
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
|
||||
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
|
||||
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
|
||||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
|
||||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
|
||||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
|
||||
"stxvd2x %x[x3], %[i48], %[y_ptr] "
|
||||
// Outputs. x1 is bound to t2 and x2 to t1; the asm uses x2 for the second
// and x1 for the third vector consistently, so the crossed binding is harmless.
:
|
||||
[mem_x] "+m" (*(float (*)[2*n])x),
|
||||
[mem_y] "+m" (*(float (*)[2*n])y),
|
||||
[temp_n] "+r" (n),
|
||||
[x_ptr] "+&b" (x),
|
||||
[y_ptr] "+&b" (y),
|
||||
[x0] "=wa" (t0),
|
||||
[x1] "=wa" (t2),
|
||||
[x2] "=wa" (t1),
|
||||
[x3] "=wa" (t3),
|
||||
[x4] "=wa" (t4),
|
||||
[x5] "=wa" (t5),
|
||||
[x6] "=wa" (t6),
|
||||
[x7] "=wa" (t7)
|
||||
// Inputs: the coefficients and the byte offsets used as index registers.
:
|
||||
[cos] "f" (c),
|
||||
[sin] "f" (s),
|
||||
[i16] "b" (16),
|
||||
[i32] "b" (32),
|
||||
[i48] "b" (48)
|
||||
// Clobbers: cr0 is set by addic.; the vsNN registers are the ones hard-coded above.
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Apply a plane rotation with real coefficients c and s to two complex
 * vectors, in place:
 *
 *     x[k] <- c * x[k] + s * y[k]
 *     y[k] <- c * y[k] - s * x[k]      (using the old x[k])
 *
 * n            - number of complex elements; nothing is done when n <= 0
 * x, inc_x     - first vector and its stride in complex elements
 * y, inc_y     - second vector and its stride in complex elements
 * c, s         - rotation coefficients
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT new_re;
    FLOAT new_im;

    if (n <= 0) return 0;

    if ((inc_x == 1) && (inc_y == 1)) {

#if defined(POWER8) || defined(POWER9)
        /*
         * crot_kernel_8 only exists under this same condition, so the call
         * must be guarded too; otherwise the build breaks on other targets.
         * The kernel handles the largest multiple of 8 elements.
         */
        BLASLONG n1 = n & -8;
        if (n1 > 0) {
            crot_kernel_8(n1, x, y, c, s);
            i = n1;
            ix = 2 * n1;
        }
#endif

        /* Scalar loop for whatever the kernel did not cover. */
        while (i < n) {
            new_re = c*x[ix] + s*y[ix] ;
            new_im = c*x[ix+1] + s*y[ix+1] ;
            y[ix] = c*y[ix] - s*x[ix] ;
            y[ix+1] = c*y[ix+1] - s*x[ix+1] ;
            x[ix] = new_re ;
            x[ix+1] = new_im ;

            ix += 2 ;
            i++ ;
        }

    } else {

        /* Strides in FLOATs: two per complex element. */
        const BLASLONG inc_x2 = 2 * inc_x ;
        const BLASLONG inc_y2 = 2 * inc_y ;

        while (i < n) {
            new_re = c*x[ix] + s*y[iy] ;
            new_im = c*x[ix+1] + s*y[iy+1] ;
            y[iy] = c*y[iy] - s*x[ix] ;
            y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
            x[ix] = new_re ;
            x[ix+1] = new_im ;

            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }

    return 0;
}
|
||||
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "cswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dasum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "daxpy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dcopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "ddot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,249 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
|
||||
/*
 * DGEMM kernel entry/exit code for POWER9. This part only sets up register
 * aliases, saves and restores the registers it touches, and loads alpha;
 * the computation itself comes from the two included files
 * (dgemm_macros_power9.S and dgemm_logic_power9.S).
 */
#define LOAD ld
|
||||
|
||||
|
||||
|
||||
|
||||
/* Frame size and the two scratch slots inside it: 488(SP) and 496(SP). */
#define STACKSIZE (512 )
|
||||
#define ALPHA_SP (296+192)(SP)
|
||||
#define FZERO (304+192)(SP)
|
||||
|
||||
|
||||
|
||||
/* Register aliases for the incoming arguments. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#define A r7
|
||||
#define B r8
|
||||
#define C r9
|
||||
#define LDC r10
|
||||
#define OFFSET r6
|
||||
|
||||
|
||||
|
||||
#define alpha_r vs18
|
||||
|
||||
#define o0 0
|
||||
|
||||
|
||||
/* Working registers; o8..o48 are loaded with the constants 8..48 below. */
#define T4 r12
|
||||
#define T3 r11
|
||||
#define C4 r14
|
||||
#define o8 r15
|
||||
#define o24 r16
|
||||
#define C2 r17
|
||||
#define L r18
|
||||
#define T1 r19
|
||||
#define C3 r20
|
||||
#define TEMP_REG r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T2 r31
|
||||
|
||||
#include "dgemm_macros_power9.S"
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Allocate the frame; r0 = 0 is stored into FZERO further down. */
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
/* Save f14-f31 at offsets 0..136. */
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
/* Save r14-r31 at offsets 144..280. */
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
/* Save v20-v31, 16 bytes each, at offsets 288..464. */
stxv v20, 288(SP)
|
||||
stxv v21, 304(SP)
|
||||
stxv v22, 320(SP)
|
||||
stxv v23, 336(SP)
|
||||
stxv v24, 352(SP)
|
||||
stxv v25, 368(SP)
|
||||
stxv v26, 384(SP)
|
||||
stxv v27, 400(SP)
|
||||
stxv v28, 416(SP)
|
||||
stxv v29, 432(SP)
|
||||
stxv v30, 448(SP)
|
||||
stxv v31, 464(SP)
|
||||
|
||||
|
||||
/* Spill f1 (presumably the alpha argument - confirm against the caller) and a zero word. */
stfd f1, ALPHA_SP
|
||||
stw r0, FZERO
|
||||
|
||||
/* Scale LDC by 2^BASE_SHIFT (BASE_SHIFT is defined outside this file; presumably elements -> bytes). */
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
/* Leave early when M, N or K is <= 0. .L999_H1 is not defined in this file; presumably it lives in dgemm_logic_power9.S. */
cmpwi cr0, M, 0
|
||||
ble .L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble .L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble .L999_H1
|
||||
|
||||
|
||||
|
||||
/* T1 = address of the ALPHA_SP slot (SP + 488). */
addi T1, SP, 296+192
|
||||
|
||||
|
||||
li PRE, 384
|
||||
li o8 , 8
|
||||
li o16, 16
|
||||
li o24, 24
|
||||
li o32, 32
|
||||
li o48, 48
|
||||
|
||||
|
||||
/* Load the spilled doubleword and replicate it into both halves of alpha_r. */
lxvdsx alpha_r, 0, T1
|
||||
|
||||
#include "dgemm_logic_power9.S"
|
||||
|
||||
/* Common exit: return value 0, restore everything saved above, pop the frame. */
.L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
lxv v20, 288(SP)
|
||||
lxv v21, 304(SP)
|
||||
lxv v22, 320(SP)
|
||||
lxv v23, 336(SP)
|
||||
lxv v24, 352(SP)
|
||||
lxv v25, 368(SP)
|
||||
lxv v26, 384(SP)
|
||||
lxv v27, 400(SP)
|
||||
lxv v28, 416(SP)
|
||||
lxv v29, 432(SP)
|
||||
lxv v30, 448(SP)
|
||||
lxv v31, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dgemv_n_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue