Revert "rebase? (#1)"

This reverts commit 2a669e79e1.
This commit is contained in:
TiborGY 2019-07-07 13:06:18 +02:00
parent 2a669e79e1
commit ef0499fd5b
453 changed files with 9682 additions and 54444 deletions

View File

@ -1,143 +0,0 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest

View File

@ -25,15 +25,6 @@ matrix:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@ -169,10 +160,45 @@ matrix:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
osx_image: xcode8.3
env:
- BTYPE="BINARY=32"
- &emulated-arm
dist: trusty
sudo: required
services: docker
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
name: "Emulated Build for ARMV6 with gcc"
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
script: |
echo "FROM openblas/alpine:${IMAGE_ARCH}
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=${TARGET_ARCH} \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
- <<: *emulated-arm
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
name: "Emulated Build for ARMV6 with clang"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
name: "Emulated Build for ARMV8 with gcc"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
name: "Emulated Build for ARMV8 with clang"
allow_failures:
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
# whitelist
branches:
only:

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 7.dev)
set(OpenBLAS_PATCH_VERSION 6.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -20,14 +20,9 @@ if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
@ -47,19 +42,6 @@ endif()
#######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@ -80,10 +62,10 @@ endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src)
endif()
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
@ -152,7 +134,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -167,9 +149,15 @@ if (${DYNAMIC_ARCH})
endforeach()
endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
# Android needs to explicitly link against libm
if(ANDROID)
@ -178,7 +166,7 @@ endif()
# Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
@ -211,8 +199,7 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
if (MSVC OR NOT NOFORTRAN)
# Broken without fortran on unix
add_subdirectory(utest)
endif()
@ -230,14 +217,6 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
@ -335,7 +314,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@ -348,11 +327,10 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
endif()
if(NOT NO_LAPACKE)

View File

@ -167,7 +167,4 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@ -1,82 +1,4 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.6
29-Apr-2019
common:
* the build tools now check that a given cpu TARGET is actually valid
* the build-time check of system features (c_check) has been made
less dependent on particular perl features (this should mainly
benefit building on Windows)
* several problems with the ReLAPACK integration were fixed,
including INTERFACE64 support and building a shared library
* building with CMAKE on BSD systems was improved
* a non-absolute SUM function was added based on the
existing optimized code for ASUM
* CBLAS interfaces to the IxMIN and IxMAX functions were added
* a name clash between LAPACKE and BOOST headers was resolved
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
kernels
* a crash on thread (key) deletion with the USE_TLS=1 memory management
option was fixed
* restored several earlier fixes, in particular for OpenMP performance,
building on BSD, and calling fork on CYGWIN, which had inadvertently
been dropped in the 0.3.3 rewrite of the memory management code.
x86_64:
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
* building with old versions of MSVC was fixed
* it is now possible to build a static library on Windows with CMAKE
* accessing environment variables on CYGWIN at run time was fixed
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
with CMAKE as well
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
* assembly bugs involving undeclared modification of input operands were fixed
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
* a similar bug was fixed in the blas_quickdivide code used to split workloads
in most functions
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
environment does not support AVX512
* improved GEMM performance on ZEN targets
x86:
* build failures caused by the recently added checks for AVX512 were fixed
* an inline assembly bug involving undeclared modification of an input argument was
fixed in the blas_quickdivide code used to split workloads in most functions
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
MIPS32:
* a bug in the IMIN implementation made it return the result of IMAX
POWER:
* single precision BLAS1/2 functions have received optimized POWER8 kernels
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
* building on PPC970 systems under OSX Leopard or Tiger is now supported
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
* building a shared library on AIX is now supported for POWER6
* DYNAMIC_ARCH support has been added for POWER6 and newer
ARMv7:
* corrected xDOT behaviour with zero INC_X or INC_Y
* a bug in the IMIN implementation made it return the result of IMAX
ARMv8:
* added support for HiSilicon TSV110 cpus
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* cross-compilation with CMAKE now works again
* a bug in the IMIN implementation made it return the result of IMAX
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
IBM Z:
* optimized microkernels for single precision BLAS1/2 functions have been added
for both Z13 and Z14
====================================================================
Version 0.3.5
31-Dec-2018

View File

@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@ -96,7 +96,7 @@ endif
@echo
shared :
ifneq ($(NO_SHARED), 1)
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ -123,13 +123,10 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
endif
$(MAKE) -C utest all
endif
ifndef NO_CBLAS
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif

View File

@ -38,8 +38,3 @@ ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

View File

@ -58,14 +58,14 @@ ifndef NO_LAPACKE
endif
#for install static library
ifneq ($(NO_STATIC),1)
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@ -106,14 +106,14 @@ ifndef NO_LAPACKE
endif
#for install static library
ifneq ($(NO_STATIC),1)
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@ -138,7 +138,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifneq ($(NO_SHARED),1)
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -9,15 +9,7 @@ else
USE_OPENMP = 1
endif
ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
endif
endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
@ -29,10 +21,6 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.7.dev
VERSION = 0.3.6.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -58,12 +58,6 @@ VERSION = 0.3.7.dev
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
@ -163,10 +157,6 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@ -191,17 +181,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this number by THREAD_TIMEOUT
# system). You can also control this number via THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contiguous memory
# Using a special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
@ -209,7 +199,7 @@ NO_AFFINITY = 1
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need sanity check by comparing results to reference BLAS. It'll be very
# If you need a sanity check by comparing against reference BLAS. It'll be very
# slow (Not implemented yet).
# SANITY_CHECK = 1
@ -249,21 +239,6 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# End of user configuration
#

View File

@ -9,11 +9,6 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture.
ifndef ARCH
ARCH := $(shell uname -m)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@ -142,12 +137,7 @@ endif
endif
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifneq ($(C_COMPILER), PGI)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@ -165,8 +155,7 @@ GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(NO_AVX2), 1)
@ -247,10 +236,6 @@ SMP = 1
endif
endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC
NEED_PIC = 1
endif
@ -402,12 +387,6 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
#
# Architecture dependent settings
#
@ -548,12 +527,6 @@ DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
endif
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
DYNAMIC_CORE += POWER9
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
override DYNAMIC_ARCH=
@ -764,8 +737,6 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@ -1071,7 +1042,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifeq ($(USE_TLS), 1)
ifdef USE_TLS
CCOMMON_OPT += -DUSE_TLS
endif

View File

@ -28,15 +28,11 @@ endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif

View File

@ -4,7 +4,3 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -6,13 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@ -24,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@ -65,7 +63,9 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@ -115,7 +115,6 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@ -134,13 +133,11 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS

View File

@ -48,7 +48,6 @@ POWER5
POWER6
POWER7
POWER8
POWER9
PPCG4
PPC970
PPC970MP
@ -91,9 +90,7 @@ CORTEXA73
FALKOR
THUNDERX
THUNDERX2T99
TSV110
9.System Z:
ZARCH_GENERIC
Z13
Z14

View File

@ -35,14 +35,7 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@ -59,17 +52,10 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
build_script:
- cmake --build .
@ -78,4 +64,3 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest

View File

@ -1,51 +0,0 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {

View File

@ -2,8 +2,6 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -21,6 +19,7 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -28,21 +27,29 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), nrow = n)
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@ -2,8 +2,6 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -21,6 +19,7 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -28,13 +27,26 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n), nrow = n)
B <- matrix(runif(n * n), nrow = n)
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
C <- 1
z <- system.time(for (l in 1:loops) {
@ -42,10 +54,11 @@ while (n <= nto) {
l <- l + 1
})
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@ -2,8 +2,6 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@ -21,6 +19,7 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@ -28,22 +27,31 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), nrow = n)
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

85
c_check
View File

@ -1,7 +1,7 @@
#!/usr/bin/perl
#use File::Basename;
# use File::Temp qw(tempfile);
use File::Basename;
use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
#$tmpf = new File::Temp( UNLINK => 1 );
$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@ -31,25 +31,12 @@ if ($?) {
$cross_suffix = "";
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
$compiler = "";
@ -184,26 +171,20 @@ if ($?) {
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
@ -223,25 +204,17 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("$tmpf.o");
$no_avx512 = 0;
}
unlink("tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

15
cblas.h
View File

@ -73,11 +73,6 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
@ -93,16 +88,6 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@ -73,16 +73,11 @@ if (DYNAMIC_ARCH)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
unset(DYNAMIC_ARCH)
endif ()
endif ()

View File

@ -44,10 +44,7 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
@ -107,12 +107,6 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro ()
macro(SetDefaultL2)
@ -168,4 +162,4 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
endmacro ()
endmacro ()

View File

@ -8,11 +8,6 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
set(EXTRALIB "${EXTRALIB} -lm")
endif ()

View File

@ -59,9 +59,6 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@ -85,11 +82,6 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling

View File

@ -65,18 +65,6 @@ if (DEFINED TARGET)
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@ -148,16 +136,10 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@ -174,9 +156,6 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@ -202,24 +181,12 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()
if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface
@ -309,7 +276,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: need to convert these Makefiles
# TODO: nead to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")

View File

@ -39,21 +39,13 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
set(X86_64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
set(ARM 1)
endif()
set(ARM64 1)
endif()
if (X86_64)
@ -86,7 +78,7 @@ endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()

View File

@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@ -85,8 +85,6 @@ extern "C" {
#if !defined(_MSC_VER)
#include <unistd.h>
#elif _MSC_VER < 1900
#define snprintf _snprintf
#endif
#include <time.h>
@ -131,7 +129,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
#if defined(SMP) || defined(USE_LOCKING)
#ifdef SMP
#include <pthread.h>
#endif
#endif
@ -200,7 +198,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
#if defined(SMP) || defined(USE_LOCKING)
#ifdef SMP
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif
@ -350,11 +348,6 @@ typedef int blasint;
#endif
#endif
#ifdef POWER9
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/*
#ifdef PILEDRIVER
@ -446,7 +439,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else

View File

@ -19,7 +19,6 @@
#define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k
#define CROT_K csrot_k
@ -250,7 +249,6 @@
#define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k

View File

@ -19,7 +19,6 @@
#define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
@ -175,7 +174,6 @@
#define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k

View File

@ -122,13 +122,6 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);

View File

@ -100,13 +100,6 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);

View File

@ -66,7 +66,6 @@
#define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
@ -357,7 +356,6 @@
#define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
@ -660,7 +658,6 @@
#define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
@ -965,7 +962,6 @@
#define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K
#define ROT_K XROT_K
@ -1367,7 +1363,6 @@
#define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K
#define ROT_K ZROT_K
@ -1790,7 +1785,6 @@
#define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K
#define ROT_K CROT_K

View File

@ -63,7 +63,6 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -155,7 +154,6 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@ -247,7 +245,6 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@ -335,7 +332,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -499,7 +495,6 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -665,7 +660,6 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);

View File

@ -39,7 +39,7 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#if defined(POWER8) || defined(POWER9)
#if defined(POWER8)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
#else
@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8) || defined(POWER9)
#if defined(POWER8)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifdef OS_LINUX
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@ -598,14 +598,9 @@ REALNAME:;\
#ifndef __64BIT__
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\
.REALNAME:
.REALNAME:;
#define EPILOGUE \
_section_.text:;\
@ -616,14 +611,9 @@ _section_.text:;\
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\
.REALNAME:
.REALNAME:;
#define EPILOGUE \
_section_.text:;\
@ -784,7 +774,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifdef OS_LINUX
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@ -812,7 +802,7 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) || defined(POWER9)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
@ -829,7 +819,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifdef OS_LINUX
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else

View File

@ -19,7 +19,6 @@
#define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
@ -162,7 +161,6 @@
#define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k

View File

@ -12,7 +12,6 @@
#define ISMAX_K ismax_k
#define ISMIN_K ismin_k
#define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k
@ -171,7 +170,6 @@
#define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Choosing a SIZE too small will lead to a stack smashing.
* Chosing a too small SIZE will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@ -19,7 +19,6 @@
#define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k
#define XROT_K xqrot_k
@ -228,7 +227,6 @@
#define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k

View File

@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
return result;
#endif
@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimization for barcelona.
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -129,13 +129,12 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
__asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
__asm__ __volatile__("cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
: "0" (op));
: "0" (op), "c"(0));
#endif
}
@ -211,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
return result;
}
@ -277,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimization for barcelona.
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -19,7 +19,6 @@
#define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k
#define ZROT_K zdrot_k
@ -250,7 +249,6 @@
#define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k

View File

@ -1,14 +0,0 @@
# Build rules for the C++ thread-safety testers; both link against ../libopenblas.a.
# Settings such as COMMON_OPT are expected to come from the included ../Makefile.rule.
include ../Makefile.rule
# Default goal: build both testers (each recipe also runs the freshly built binary).
all :: dgemv_tester dgemm_tester
# Compile the DGEMV tester with OpenMP and C++11 enabled, then execute it immediately.
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
# Compile and execute the DGEMM tester; dgemv_tester is a prerequisite, so its rule is processed first.
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
# Remove the tester executables.
clean ::
rm -f dgemv_tester dgemm_tester

View File

@ -1,55 +0,0 @@
/// Portable "pause": prints a prompt on std::cout and blocks until one line
/// (terminated by Enter) has been read from std::cin. The line is discarded.
inline void pauser(){
    std::cout << "Press enter to continue...";
    std::string discardedLine;  // only used as a sink for the typed text
    std::getline(std::cin, discardedLine);
}
/// Fills the first numMat matrices of matBlock with random values and then
/// replicates that group into every following group of numMat matrices, so
/// each of the numConcurrentThreads groups holds the same data.
///
/// @param matBlock             storage; every matrix must already be sized to
///                             at least randomMatSize*randomMatSize elements
/// @param PRNG                 random engine the values are drawn from
/// @param rngdist              distribution applied to PRNG
/// @param randomMatSize        dimension of the square matrices
/// @param numConcurrentThreads number of groups in matBlock
/// @param numMat               number of matrices per group
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t elemCount = static_cast<uint32_t>(randomMatSize*randomMatSize);
    const uint32_t totalMat = numConcurrentThreads*numMat;

    // Randomize the first group, element by element.
    for(uint32_t mat = 0; mat < numMat; ++mat){
        for(uint32_t elem = 0; elem < elemCount; ++elem){
            matBlock[mat][elem] = rngdist(PRNG);
        }
    }

    // Copy the first group over each later group (whole-vector assignment).
    for(uint32_t groupStart = numMat; groupStart < totalMat; groupStart += numMat){
        for(uint32_t mat = 0; mat < numMat; ++mat){
            matBlock[groupStart + mat] = matBlock[mat];
        }
    }
}
/// Fills the first numVec vectors of vecBlock with random values and then
/// replicates that group into every following group of numVec vectors, so
/// each of the numConcurrentThreads groups holds the same data.
///
/// @param vecBlock             storage; every vector must already hold at
///                             least randomMatSize elements
/// @param PRNG                 random engine the values are drawn from
/// @param rngdist              distribution applied to PRNG
/// @param randomMatSize        number of elements randomized per vector
/// @param numConcurrentThreads number of groups in vecBlock
/// @param numVec               number of vectors per group
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
    const uint32_t elemCount = static_cast<uint32_t>(randomMatSize);
    const uint32_t totalVec = numConcurrentThreads*numVec;

    // Randomize the first group, element by element.
    for(uint32_t vec = 0; vec < numVec; ++vec){
        for(uint32_t elem = 0; elem < elemCount; ++elem){
            vecBlock[vec][elem] = rngdist(PRNG);
        }
    }

    // Copy the first group over each later group (whole-vector assignment).
    for(uint32_t groupStart = numVec; groupStart < totalVec; groupStart += numVec){
        for(uint32_t vec = 0; vec < numVec; ++vec){
            vecBlock[groupStart + vec] = vecBlock[vec];
        }
    }
}
/// Creates a 64-bit Mersenne Twister seeded from std::random_device and warms
/// it up by drawing 10 million values through a [-1, 1) real distribution.
///
/// @return the seeded and warmed-up engine
std::mt19937_64 InitPRNG(){
    // One value from the OS-provided entropy source is used as the seed.
    std::random_device entropySource;
    std::mt19937_64 engine(entropySource());

    // Warm-up: PRNG output can have poor statistical properties until the
    // internal state is well mixed, so throw away the first 10M draws.
    std::uniform_real_distribution<double> warmupDist{-1.0, 1.0};
    uint32_t remainingDraws = 10000000;
    while (remainingDraws-- > 0){
        warmupDist(engine);
    }
    return engine;
}
/// Debug helper: dumps every matrix in matBlock to std::cout.
/// For each of the numConcurrentThreads*numMat matrices it prints the matrix
/// index on its own line, then randomMatSize lines of randomMatSize
/// space-separated values, then an empty line.
///
/// @param matBlock             matrices to print, each stored as a flat vector
/// @param randomMatSize        dimension of the square matrices
/// @param numConcurrentThreads number of groups in matBlock
/// @param numMat               number of matrices per group
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t dim = static_cast<uint32_t>(randomMatSize);
    const uint32_t matCount = numConcurrentThreads*numMat;

    for (uint32_t idx = 0; idx < matCount; ++idx){
        std::cout << idx << std::endl;
        for (uint32_t outer = 0; outer < dim; ++outer){
            // outer*randomMatSize is the flat offset of this printed line
            for (uint32_t inner = 0; inner < dim; ++inner){
                std::cout << matBlock[idx][outer*randomMatSize + inner] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}

View File

@ -1,92 +0,0 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
// DGEMM thread-safety tester entry point.
// Launches numConcurrentThreads simultaneous cblas_dgemm calls on identical inputs and
// checks that every call produced the same C matrix as the first one, for numTestRounds rounds.
// Optional command line (all three must be given together): <matrix size> <concurrent calls> <rounds>.
// Returns 0 when all rounds pass, -1 on the first mismatch; aborts when more than 3 arguments are given.
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
// NOTE(review): the defaults are overridden only when exactly three arguments are supplied;
// with one or two arguments they are silently ignored - confirm this is intended.
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
// Three matrices per concurrent call, stored consecutively: A, B, C.
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
// One future per concurrent call, used to wait for its completion.
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
// Memory estimate: size*size elements * (3 matrices per call) * 8 bytes per double, shown in MiB.
// NOTE(review): randomMatSize*randomMatSize is multiplied in blasint before the cast to uint64_t.
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
// Randomizes the first A/B/C triple and copies it into every other triple, so all calls get identical inputs.
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
// Each OpenMP iteration starts one asynchronous DGEMM on its own A/B/C triple; the futures are collected below.
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
// get() blocks until the corresponding DGEMM call has finished.
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
// The first call's C (matBlock[2]) is the reference; every other C must match it element-wise
// within an absolute tolerance of 1.0E-13.
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -1,101 +0,0 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
// DGEMV thread safety tester.
//
// Usage: <program> [matrixSize numThreads numRounds]
//   matrixSize - dimension of the square random matrices and of the vectors (default 1024)
//   numThreads - number of concurrent cblas_dgemv calls (default 52)
//   numRounds  - number of test rounds (default 16)
// Either all three arguments are given or none; more than three aborts.
//
// Every round launches numThreads concurrent DGEMV calls. Thread i uses matrix
// matBlock[i], input vector vecBlock[2*i] and output vector vecBlock[2*i+1].
// Afterwards every thread's y is compared against thread 0's y with a
// tolerance of 1.0E-13.
// NOTE(review): this check only makes sense if FillMatrices/FillVectors give
// every thread identical inputs - confirm in cpp_thread_safety_common.h.
//
// Returns 0 when all rounds pass, -1 as soon as one thread's result differs.
int main(int argc, char* argv[]){
    blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
    uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
    uint32_t numTestRounds = 16; //number of testing rounds before success exit
    if (argc > 4){
        std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
        abort();
    }
    if(argc == 4){
        std::vector<std::string> cliArgs;
        for (int i = 1; i < argc; i++){
            cliArgs.push_back(argv[i]);
            std::cout<<argv[i]<<std::endl;
        }
        randomMatSize = std::stoul(cliArgs.at(0));
        numConcurrentThreads = std::stoul(cliArgs.at(1));
        numTestRounds = std::stoul(cliArgs.at(2));
    } else if (argc > 1){
        // One or two arguments used to be dropped without any feedback; keep running
        // with the defaults, but tell the user that their input was not used.
        std::cout<<"WARNING: expected exactly 3 arguments (matrix size, thread count, test rounds); ignoring the given arguments and using defaults"<<std::endl;
    }

    // Elements per matrix. Widen BEFORE multiplying: blasint may be a 32-bit int,
    // in which case randomMatSize*randomMatSize overflows for sizes >= 46341.
    const uint64_t matElems = static_cast<uint64_t>(randomMatSize)*static_cast<uint64_t>(randomMatSize);

    std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
    std::vector<std::vector<double>> matBlock(numConcurrentThreads);   // one A per thread
    std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2); // x and y for each thread
    std::vector<std::future<void>> futureBlock(numConcurrentThreads);
    std::cout<<"*----------------------------*\n";
    std::cout<<"| DGEMV thread safety tester |\n";
    std::cout<<"*----------------------------*\n";
    std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
    std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
    std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
    // one matrix plus two vectors per thread, 8 bytes per double
    std::cout<<"This test will need "<<((matElems*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
    std::cout<<"Initializing random number generator..."<<std::flush;
    std::mt19937_64 PRNG = InitPRNG();
    std::cout<<"done\n";
    std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
    std::cout<<"Allocating matrices..."<<std::flush;
    for(uint32_t i=0; i<numConcurrentThreads; i++){
        matBlock.at(i).resize(static_cast<size_t>(matElems));
    }
    std::cout<<"done\n";
    std::cout<<"Allocating vectors..."<<std::flush;
    for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
        vecBlock.at(i).resize(randomMatSize);
    }
    std::cout<<"done\n";
    //pauser();
    std::cout<<"Filling matrices with random numbers..."<<std::flush;
    FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
    //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
    std::cout<<"done\n";
    std::cout<<"Filling vectors with random numbers..."<<std::flush;
    FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
    std::cout<<"done\n";
    std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
    omp_set_num_threads(numConcurrentThreads);
    for(uint32_t R=0; R<numTestRounds; R++){
        std::cout<<"DGEMV round #"<<R<<std::endl;
        std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
        // Each OpenMP iteration starts one asynchronous task, so the std::async
        // launches themselves are issued concurrently.
        #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
        }
        std::cout<<"done\n";
        std::cout<<"Waiting for threads to finish..."<<std::flush;
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i].get();
        }
        std::cout<<"done\n";
        std::cout<<"Comparing results from different threads..."<<std::flush;
        for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
            for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
                if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
                    std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
                    std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
                    return -1;
                }
            }
        }
        std::cout<<"OK!\n"<<std::endl;
    }
    std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
    return 0;
}

View File

@ -39,8 +39,6 @@
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
static char *cpuname[] = {
"UNKNOWN",
@ -51,8 +49,7 @@ static char *cpuname[] = {
"CORTEXA73",
"FALKOR",
"THUNDERX",
"THUNDERX2T99",
"TSV110"
"THUNDERX2T99"
};
static char *cpuname_lower[] = {
@ -64,8 +61,7 @@ static char *cpuname_lower[] = {
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99",
"tsv110"
"thunderx2t99"
};
int get_feature(char *search)
@ -149,9 +145,6 @@ int detect(void)
return CPU_THUNDERX;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
}
p = (char *) NULL ;
@ -293,21 +286,6 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}

View File

@ -94,7 +94,7 @@ char *corename[] = {
"CELL",
"PPCG4",
"POWER8",
"POWER9"
"POWER8"
};
int detect(void){
@ -124,7 +124,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
@ -156,7 +156,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
return CPUTYPE_POWER5;
@ -180,7 +180,7 @@ int id;
__asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return CPUTYPE_POWER9;
return CPUTYPE_POWER8;
break;
case 0x4d:
case 0x4b: // POWER8/8E

View File

@ -1359,8 +1359,6 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM;
}
break;
@ -1378,9 +1376,9 @@ int get_cpuname(void){
}
break;
case 9:
case 8:
case 8:
switch (model) {
case 14: // Kaby Lake and refreshes
case 14: // Kaby Lake
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())

View File

@ -27,9 +27,9 @@
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
@ -64,8 +64,10 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC;
}
@ -114,14 +116,7 @@ void get_cpuconfig(void)
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@ -113,7 +113,7 @@ ARCH_X86
ARCH_X86_64
#endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
ARCH_POWER
#endif

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;

View File

@ -18,12 +18,8 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
COMMONOBJS += dynamic_power.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@ -82,12 +78,8 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this global for checking if initialization is finished. */
/* We need this grobal for cheking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
/* Monitor is a function to see thread's status for every seconds. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];

View File

@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */
/* Thread server common infomation */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for checking if initialization is finished. */
/* We need this global for cheking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
@ -461,18 +461,13 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}

View File

@ -322,7 +322,7 @@ int support_avx2(){
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
#ifndef NO_AVX512
int eax, ebx, ecx, edx;
int ret=0;
@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
//Apollo Lake
if (model == 12) {
return &gotoblas_NEHALEM;
}
return NULL;

View File

@ -1,102 +0,0 @@
#include "common.h"
/* Per-core function tables that this file can select between. */
extern gotoblas_t gotoblas_POWER6;
extern gotoblas_t gotoblas_POWER8;
extern gotoblas_t gotoblas_POWER9;
/* Diagnostic output helper; the first argument is a verbosity level. */
extern void openblas_warning(int verbose, const char *msg);
/*
 * Names of the selectable cores. The index is significant: gotoblas_corename()
 * and force_coretype() in this file map index 1/2/3 to POWER6/POWER8/POWER9.
 * Index 0 ("unknown") does not correspond to any function table.
 */
static char *corename[] = {
"unknown",
"POWER6",
"POWER8",
"POWER9"
};
/* Number of entries in corename[]; keep the two in sync. */
#define NUM_CORETYPES 4
/*
 * Return the name of the core whose function table is currently installed in
 * the global `gotoblas` pointer, or "unknown" when it points at none of the
 * tables handled by this file.
 */
char *gotoblas_corename(void) {
  int idx = 0; /* corename[0] is "unknown" */

  if (gotoblas == &gotoblas_POWER6) {
    idx = 1;
  } else if (gotoblas == &gotoblas_POWER8) {
    idx = 2;
  } else if (gotoblas == &gotoblas_POWER9) {
    idx = 3;
  }

  return corename[idx];
}
/*
 * Pick the function table matching the CPU the process is running on, using
 * the compiler's __builtin_cpu_is() runtime check.
 * Returns NULL when the CPU is none of power6/power6x/power8/power9.
 */
static gotoblas_t *get_coretype(void) {
  gotoblas_t *detected = NULL;

  if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) {
    detected = &gotoblas_POWER6;
  } else if (__builtin_cpu_is("power8")) {
    detected = &gotoblas_POWER8;
  } else if (__builtin_cpu_is("power9")) {
    detected = &gotoblas_POWER9;
  }

  return detected;
}
/*
 * Map a user-supplied core name to its function table.
 *
 * coretype is compared case-insensitively (at most 20 characters) against the
 * entries of corename[].
 *
 * Returns the matching table, or NULL when the name is not recognised or is
 * the "unknown" placeholder. In the NULL case a "Core not found" warning is
 * issued at verbosity level 1.
 */
static gotoblas_t *force_coretype(char * coretype) {
  int i;
  int found = -1;
  char message[128];

  for (i = 0; i < NUM_CORETYPES; i++) {
    if (!strncasecmp(coretype, corename[i], 20)) {
      found = i;
      break;
    }
  }

  switch (found) {
    case 1: return (&gotoblas_POWER6);
    case 2: return (&gotoblas_POWER8);
    case 3: return (&gotoblas_POWER9);
    default: break; /* no match, or index 0 ("unknown") */
  }

  /* Previously unreachable: the default case returned before the warning. */
  snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
}
/*
 * Install the function table for this process into the global `gotoblas`.
 *
 * Does nothing if a table is already installed. Otherwise:
 *   1. if the OPENBLAS_CORETYPE environment variable is set, use the core it
 *      names (force_coretype);
 *   2. else detect the running CPU (get_coretype);
 *   3. if neither produced a table, warn and fall back to POWER8.
 * The selected table's init hook is then called. If the table has no init
 * hook, an error is reported and the process exits with status 1.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if (p) {
    gotoblas = force_coretype(p);
  } else {
    gotoblas = get_coretype();
  }

  if (gotoblas == NULL) {
    snprintf(coremsg, sizeof(coremsg), "Falling back to POWER8 core\n");
    openblas_warning(1, coremsg);
    gotoblas = &gotoblas_POWER8;
  }

  if (gotoblas && gotoblas -> init) {
    /*
     * Format the name directly with a bounded write. "%.20s" keeps the
     * 20-character cap of the former strncpy(), which did not guarantee
     * NUL termination of its scratch buffer.
     */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas -> init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/*
 * Drop the installed function table. gotoblas_dynamic_init() returns early
 * while `gotoblas` is non-NULL, so clearing it lets a later init call run the
 * selection again.
 */
void gotoblas_dynamic_quit(void)
{
  gotoblas = NULL;
}

View File

@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than initial condition */
/* if number of threads is larger than inital condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@ -857,14 +857,7 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@ -897,7 +890,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) {
common->num_procs = nums;
} else {
@ -905,11 +898,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,&cpuset)) n++;
if (CPU_ISSET(i,cpusetp)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(&cpuset);
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
}
#endif

View File

@ -198,68 +198,45 @@ int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,&cpuset)) n++;
if (CPU_ISSET(i,cpusetp)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
#endif
return nums;
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
#endif
#endif
}
@ -1313,13 +1290,6 @@ void blas_memory_free_nolock(void * map_address) {
free(map_address);
}
#ifdef SMP
void blas_thread_memory_cleanup(void) {
blas_memory_cleanup((void*)get_memory_table());
}
#endif
void blas_shutdown(void){
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
@ -1329,7 +1299,7 @@ void blas_shutdown(void){
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)
#endif
blas_thread_memory_cleanup();
blas_memory_cleanup((void*)get_memory_table());
#ifdef SEEK_ADDRESS
base_address = 0UL;
@ -1559,7 +1529,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
break;
case DLL_THREAD_DETACH:
#if defined(SMP)
blas_thread_memory_cleanup();
blas_memory_cleanup((void*)get_memory_table());
#endif
break;
case DLL_PROCESS_DETACH:
@ -1622,7 +1592,6 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -1630,16 +1599,13 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
#else
/* USE_TLS / COMPILE_TLS not set */
#include <errno.h>
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#ifdef OS_WINDOWS
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
@ -1653,7 +1619,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <stdio.h>
#include <fcntl.h>
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#ifndef OS_WINDOWS
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
@ -1673,7 +1639,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@ -1712,12 +1678,9 @@ void gotoblas_dummy_for_PGI(void) {
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#else
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@ -1741,70 +1704,45 @@ void goto_set_num_threads(int num_threads) {};
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,&cpuset)) n++;
if (CPU_ISSET(i,cpusetp)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
#endif
return nums;
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp);
return nums;
#endif
#endif
}
@ -1818,7 +1756,7 @@ int get_num_procs(void) {
return nums;
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
@ -1855,7 +1793,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_FREEBSD)
int get_num_procs(void) {
@ -1932,7 +1870,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
@ -1945,7 +1883,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@ -1953,11 +1891,11 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
// blas_goto_num = 0;
blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@ -1969,7 +1907,7 @@ int blas_get_cpu_number(void){
#endif
// blas_omp_num = 0;
blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
@ -1977,7 +1915,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@ -2064,15 +2002,11 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef OS_LINUX
@ -2214,18 +2148,14 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@ -2593,7 +2523,7 @@ void *blas_memory_alloc(int procpos){
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos = 0;
int mypos;
#endif
void *map_address;
@ -2624,11 +2554,6 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
@ -2664,9 +2589,6 @@ void *blas_memory_alloc(int procpos){
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG
printf("Alloc Start ...\n");
@ -2681,17 +2603,13 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
blas_lock(&memory[position].lock);
#endif
// blas_lock(&memory[position].lock);
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
// blas_unlock(&memory[position].lock);
}
position ++;
@ -2703,26 +2621,21 @@ void *blas_memory_alloc(int procpos){
position = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
#if defined(USE_OPENMP)
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
/* if (!memory[position].used) { */
/* blas_lock(&memory[position].lock);*/
if (!memory[position].used) goto allocation;
#if defined(USE_OPENMP)
blas_unlock(&memory[position].lock);
}
#endif
/* blas_unlock(&memory[position].lock);*/
/* } */
position ++;
} while (position < NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
UNLOCK_COMMAND(&alloc_lock);
goto error;
allocation :
@ -2732,11 +2645,10 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
/* blas_unlock(&memory[position].lock);*/
if (!memory[position].addr) {
do {
#ifdef DEBUG
@ -2753,7 +2665,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
}
#endif
@ -2781,13 +2693,9 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@ -2841,9 +2749,8 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
@ -2857,9 +2764,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -2874,9 +2779,8 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}
@ -2926,7 +2830,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
#if defined(SMP) || defined(USE_LOCKING)
#ifdef SMP
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
@ -2951,7 +2855,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) {
#endif
#if defined(SMP) || defined(USE_LOCKING)
#ifdef SMP
LOCK_COMMAND(&init_lock);
#endif
@ -2961,7 +2865,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE;
}
#if defined(SMP) || defined(USE_LOCKING)
#ifdef SMP
UNLOCK_COMMAND(&init_lock);
#endif
@ -3194,7 +3098,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -3202,7 +3106,6 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif

View File

@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str=""
"OpenBLAS "
VERSION

View File

@ -141,14 +141,6 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
@ -160,7 +152,6 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif
rm -f linktest

View File

@ -40,25 +40,15 @@
void gotoblas_init(void);
void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
switch(reason) {
case DLL_PROCESS_ATTACH:
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
if (reason == DLL_PROCESS_ATTACH) {
gotoblas_init();
}
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
}
return TRUE;

View File

@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;

View File

@ -637,18 +637,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER8"
#endif
#if defined(FORCE_POWER9)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER9"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER9 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power9"
#define CORENAME "POWER9"
#endif
#ifdef FORCE_PPCG4
#define FORCE
@ -1077,23 +1065,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
@ -1114,16 +1085,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE
#ifdef USER_TARGET

View File

@ -12,7 +12,6 @@ set(BLAS1_REAL_ONLY_SOURCES
rotm.c rotmg.c # N.B. these do not have complex counterparts
rot.c
asum.c
sum.c
)
# these will have 'z' prepended for the complex version
@ -24,7 +23,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c
@ -125,7 +124,6 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
endif ()
if (${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@ -134,7 +132,6 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
endif ()
endforeach ()

View File

@ -25,7 +25,7 @@ SBLAS1OBJS = \
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
scopy.$(SUFFIX) sscal.$(SUFFIX) \
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@ -51,7 +51,7 @@ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
ddot.$(SUFFIX) \
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@ -76,7 +76,7 @@ CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
@ -105,7 +105,7 @@ ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@ -146,7 +146,7 @@ QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qdot.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@ -168,7 +168,7 @@ XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@ -224,7 +224,7 @@ QBLAS3OBJS = \
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@ -263,8 +263,7 @@ CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@ -281,8 +280,7 @@ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@ -302,8 +300,7 @@ CCBLAS1OBJS = \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
cblas_caxpby.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@ -329,9 +326,7 @@ CZBLAS1OBJS = \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
cblas_zaxpby.$(SUFFIX)
CZBLAS2OBJS = \
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
@ -565,24 +560,6 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@ -1406,18 +1383,6 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1430,18 +1395,6 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1449,7 +1402,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

View File

@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small input size &
//Temporarily work-around the low performance issue with small imput size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -1,97 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
/*
 * Pointer-argument entry point for the sum routine (this variant is the
 * one compiled when CBLAS is not defined).
 *
 *   N    - pointer to the element count; read once into a BLASLONG.
 *   x    - pointer to the input vector, passed through to SUM_K unchanged.
 *   INCX - pointer to the stride; read once into a BLASLONG.
 *
 * Returns 0 when *N <= 0, otherwise the value returned by SUM_K cast to
 * FLOATRET.
 * NOTE(review): SUM_K is defined outside this file; presumably it adds up
 * the n strided elements of x without taking absolute values - confirm
 * against the kernel sources.
 */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
/* Dereference the by-reference arguments once, widening to BLASLONG. */
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
PRINT_DEBUG_NAME;
/* Nothing to sum: return before the kernel (and the profiling macros) run. */
if (n <= 0) return 0;
IDEBUG_START;
FUNCTION_PROFILE_START();
/* The kernel does the work; its result is converted to the return type. */
ret = (FLOATRET)SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#else
/*
 * By-value entry point for the sum routine (this variant is the one
 * compiled when CBLAS is defined).
 *
 *   n    - element count.
 *   vx/x - input vector. In COMPLEX builds it arrives as an untyped
 *          pointer and is cast to FLOAT*; otherwise it is already FLOAT*.
 *   incx - stride, passed through to SUM_K unchanged.
 *
 * Returns 0 when n <= 0, otherwise the value returned by SUM_K.
 * NOTE(review): SUM_K is defined outside this file; presumably it adds up
 * the n strided elements of x without taking absolute values - confirm
 * against the kernel sources.
 */
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
/* Reinterpret the untyped vector pointer as a FLOAT array. */
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif
FLOAT ret;
PRINT_DEBUG_CNAME;
/* Nothing to sum: return before the kernel (and the profiling macros) run. */
if (n <= 0) return 0;
IDEBUG_START;
FUNCTION_PROFILE_START();
ret = SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#endif

View File

@ -218,8 +218,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
nthreads = num_cpu_avail(2);
/* nthreads = num_cpu_avail(2);
FIXME trmv_thread was found to be broken, see issue 1332 */
nthreads = 1;
if (nthreads == 1) {
#endif

View File

@ -204,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
if (side < 0) info = 1;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1);
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}

View File

@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small input size &
//Temporarily work-around the low performance issue with small imput size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else
nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}

View File

@ -65,7 +65,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})

View File

@ -340,32 +340,6 @@ ifndef XSCALKERNEL
XSCALKERNEL = zscal.S
endif
### SUM ###
ifndef SSUMKERNEL
SSUMKERNEL = sum.S
endif
ifndef DSUMKERNEL
DSUMKERNEL = sum.S
endif
ifndef CSUMKERNEL
CSUMKERNEL = zsum.S
endif
ifndef ZSUMKERNEL
ZSUMKERNEL = zsum.S
endif
ifndef QSUMKERNEL
QSUMKERNEL = sum.S
endif
ifndef XSUMKERNEL
XSUMKERNEL = zsum.S
endif
### SWAP ###
ifndef SSWAPKERNEL
@ -479,7 +453,7 @@ endif
SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
saxpby_k$(TSUFFIX).$(SUFFIX)
@ -489,32 +463,31 @@ DBLASOBJS += \
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
daxpby_k$(TSUFFIX).$(SUFFIX)
QBLASOBJS += \
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
qsum_k$(TSUFFIX).$(SUFFIX)
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += \
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
XBLASOBJS += \
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
### AMAX ###
@ -644,7 +617,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
### ASUM ###
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
@ -663,26 +636,6 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
### SUM ###
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
### AXPY ###
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

View File

@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
ifeq ($(CORE), GENERIC)
ifeq ($(TARGET), GENERIC)
USE_TRMM = 1
endif
@ -44,18 +44,10 @@ ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
ifeq ($(CORE), POWER9)
USE_TRMM = 1
endif
ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif
ifeq ($(CORE), Z14)
USE_TRMM = 1
endif

View File

@ -1,206 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88
#define N $16
#define X $17
#define INCX $18
#define I $19
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23
/*
 * Strided sum kernel.
 *
 * Adds up N values loaded from X, advancing X with SXADDQ INCX, X, X after
 * each load, and leaves the total in s0 at the final ret. Values are moved
 * with fmov before being accumulated, so no absolute value is taken.
 *
 * Register roles (see the #defines above): s0-s3 are partial sums, a0-a7
 * hold loaded values, t0-t3 hold values waiting to be added.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, ADD, LD, SXADDQ and SIZE are
 * macros defined outside this file; presumably ADD/LD select the operand
 * precision and SXADDQ scales INCX by the element size - confirm in the
 * common headers.
 */
PROLOGUE
PROFCODE
/* Clear the total and the pending addend; N <= 0 exits through $L999,
   which then computes s0 = 0 + 0. */
fclr s0
unop
fclr t0
ble N, $L999
/* I = N >> 3: number of full 8-value groups. Fewer than 8 values skips
   straight to the remainder handling at $L15. */
sra N, 3, I
fclr s1
fclr s2
ble I, $L15
/* Preload the first six values of the first group (a0-a5) while clearing
   the remaining temporaries and s3. a6 and a7 are loaded later. */
LD a0, 0 * SIZE(X)
fclr t1
SXADDQ INCX, X, X
fclr t2
LD a1, 0 * SIZE(X)
fclr t3
SXADDQ INCX, X, X
fclr s3
LD a2, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a3, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a4, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a5, 0 * SIZE(X)
SXADDQ INCX, X, X
/* The last group is always finished by the drain code at $L13, so the
   loop below runs only for the groups before it. */
lda I, -1(I)
ble I, $L13
.align 4
/* Main loop: each ADD folds the value moved into t0-t3 one step earlier,
   while a6/a7 of the current group and a0-a5 of the next group are being
   loaded. */
$L12:
ADD s0, t0, s0
/* Load into $31 with the result unused - presumably a prefetch hint
   (see PREFETCHSIZE); confirm against the Alpha architecture manual. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
LD a7, 0 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
SXADDQ INCX, X, X
ADD s0, t0, s0
LD a1, 0 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
LD a3, 0 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
SXADDQ INCX, X, X
LD a5, 0 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4
/* Drain: load the last two values of the final group (a6, a7), fold the
   values still waiting in t1-t3, then reduce s1 into s0 and s3 into s2.
   t0 (holding a4) is left pending and is folded by the next
   ADD s0, t0, s0 at $L17 or $L999. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a7, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3
ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3
ADD s1, t1, s1
ADD s2, t2, s2
ADD s3, t3, s3
ADD s0, s1, s0
ADD s2, s3, s2
.align 4
/* Remainder setup: I = N & 7 leftover values; fold s2 into s0. */
$L15:
and N, 7, I
ADD s0, s2, s0
unop
ble I, $L999
.align 4
/* Remainder loop: one value per iteration. Each pass first adds the
   previously pending t0, then loads the next value into t0. */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
SXADDQ INCX, X, X
fmov a0, t0
lda I, -1(I)
bne I, $L17
.align 4
/* Fold the last pending value and return with the total in s0. */
$L999:
ADD s0, t0, s0
ret
EPILOGUE

View File

@ -1,208 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88
#define N $16
#define X $17
#define INCX $18
#define I $19
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23
/*
 * Strided sum kernel for two-value elements.
 *
 * Every element is read as two consecutive values, at 0 * SIZE(X) and
 * 1 * SIZE(X) (presumably the real and imaginary parts - confirm against
 * the callers). All values of all N elements are added together and the
 * single total is left in s0 at the final ret. Values are moved with fmov
 * before being accumulated, so no absolute value is taken.
 *
 * Register roles (see the #defines above): s0/s2 accumulate the values at
 * offset 0, s1/s3 the values at offset 1; a0-a7 hold loaded values and
 * t0-t3 hold values waiting to be added.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, ADD, LD, SXADDQ and SIZE are
 * macros defined outside this file; presumably ADD/LD select the operand
 * precision and SXADDQ scales INCX by the element size - confirm in the
 * common headers.
 */
PROLOGUE
PROFCODE
fclr s0
unop
fclr t0
/* Double INCX: each element step spans two values. */
addq INCX, INCX, INCX
fclr s1
unop
fclr t1
/* N <= 0 exits through $L999 with s0, s1, t0 and t1 all cleared. */
ble N, $L999
/* I = N >> 2: number of full 4-element groups (8 values). Fewer than 4
   elements skips straight to the remainder handling at $L15. */
fclr s2
sra N, 2, I
fclr s3
ble I, $L15
/* Preload the first three elements of the first group (a0-a5) while
   clearing t2 and t3. The fourth element (a6, a7) is loaded later. */
LD a0, 0 * SIZE(X)
fclr t2
LD a1, 1 * SIZE(X)
SXADDQ INCX, X, X
LD a2, 0 * SIZE(X)
fclr t3
LD a3, 1 * SIZE(X)
SXADDQ INCX, X, X
LD a4, 0 * SIZE(X)
LD a5, 1 * SIZE(X)
SXADDQ INCX, X, X
/* The last group is always finished by the drain code at $L13, so the
   loop below runs only for the groups before it. */
lda I, -1(I)
ble I, $L13
.align 4
/* Main loop: each ADD folds the value moved into t0-t3 one step earlier,
   while a6/a7 of the current group and a0-a5 of the next group are being
   loaded. */
$L12:
ADD s0, t0, s0
/* Load into $31 with the result unused - presumably a prefetch hint
   (see PREFETCHSIZE); confirm against the Alpha architecture manual. */
ldl $31, PREFETCHSIZE * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
unop
ADD s2, t2, s2
LD a7, 1 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
unop
ADD s0, t0, s0
LD a1, 1 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
unop
ADD s2, t2, s2
LD a3, 1 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
unop
LD a5, 1 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4
/* Drain: load the fourth element of the final group (a6, a7) and fold the
   values waiting in t2 and t3. t0 and t1 (holding a4 and a5) are left
   pending and are folded by the next ADDs at $L17 or $L999. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0
ADD s1, t1, s1
LD a7, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3
ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3
ADD s2, t2, s2
ADD s3, t3, s3
.align 4
/* Remainder setup: fold s2 into s0 and s3 into s1; I = N & 3 leftover
   elements. */
$L15:
ADD s0, s2, s0
and N, 3, I
ADD s1, s3, s1
ble I, $L999
.align 4
/* Remainder loop: one element (two values) per iteration. Each pass adds
   the previously pending t0/t1, then loads the next pair into them. */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a1, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
bne I, $L17
.align 4
/* Fold the last pending pair, then combine both running sums into s0. */
$L999:
ADD s0, t0, s0
ADD s1, t1, s1
ADD s0, s1, s0
ret
EPILOGUE

View File

@ -35,11 +35,6 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c

View File

@ -1,30 +1,30 @@
include $(KERNELDIR)/KERNEL.ARMV5
SAMAXKERNEL = amax_vfp.S
DAMAXKERNEL = amax_vfp.S
#CAMAXKERNEL = amax_vfp.S
#ZAMAXKERNEL = amax_vfp.S
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = amax_vfp.S
DAMINKERNEL = amax_vfp.S
#CAMINKERNEL = amax_vfp.S
#ZAMINKERNEL = amax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = amax_vfp.S
DMAXKERNEL = amax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = amax_vfp.S
DMINKERNEL = amax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
#ICAMAXKERNEL = iamax_vfp.S
#IZAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
#ICAMINKERNEL = iamax_vfp.S
#IZAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
@ -37,9 +37,6 @@ DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SSUMKERNEL = sum_vfp.S
DSUMKERNEL = sum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S

View File

@ -1,445 +0,0 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* VABS(x0,x1): absolute value of x1 into x0 when USE_ABS is defined, in the
   precision selected by DOUBLE.  Without USE_ABS it expands to a nop, so the
   real-valued kernels compare the raw values instead. */
#if defined(USE_ABS)
#if defined(DOUBLE)
#define VABS(x0,x1) vabs.f64 x0, x1
#else
#define VABS(x0,x1) vabs.f32 x0, x1
#endif
#else
#define VABS(x0,x1) nop
#endif
/*****************************************************************************************/
/* Conditional moves used when a candidate replaces the running result:
   the lt condition when USE_MIN is defined, the gt condition otherwise.
   VMOVCOND is the VFP register form used by the kernels below; MOVCOND is
   the core register form and is not referenced by the code visible in this
   file. */
#if defined(USE_MIN)
#define MOVCOND movlt
#if defined(DOUBLE)
#define VMOVCOND vmovlt.f64
#else
#define VMOVCOND vmovlt.f32
#endif
#else
#define MOVCOND movgt
#if defined(DOUBLE)
#define VMOVCOND vmovgt.f64
#else
#define VMOVCOND vmovgt.f32
#endif
#endif
/*****************************************************************************************/
#if !defined(COMPLEX)
/* Real-valued kernels.  The running result lives in d0 (DOUBLE) or s0;
   d4 / s4 is the scratch register for the candidate value.
   _F variants read X with post-increment (unit stride); _S variants read X
   and then advance it by INC_X, which the caller has scaled to bytes. */
#if defined(DOUBLE)
/* INIT_F: seed the result with the first element (through VABS). */
.macro INIT_F
vldmia.f64 X!, { d0 }
VABS( d0, d0 )
.endm
/* KERNEL_F1: load one candidate, compare it with the result, copy the FP
   flags to the core flags, and replace the result when VMOVCOND holds. */
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
/* INIT_S: strided counterpart of INIT_F. */
.macro INIT_S
vldmia.f64 X, { d0 }
VABS( d0, d0 )
add X, X, INC_X
.endm
/* KERNEL_S1: strided counterpart of KERNEL_F1. */
.macro KERNEL_S1
vldmia.f64 X, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
/* Single precision versions of the four macros above (s0 / s4). */
.macro INIT_F
vldmia.f32 X!, { s0 }
VABS( s0, s0 )
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 }
VABS( s0, s0 )
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#else
/* Complex kernels.  Every element is a pair of values; the quantity that is
   compared is |first| + |second|, computed with vabs unconditionally (VABS
   and USE_ABS are not consulted here).  The running result lives in d0 / s0,
   the candidate is built in d4 / s4 with d5 / s5 (and d1 / s1 during INIT)
   as scratch.  _F variants use post-increment, _S variants add INC_X. */
#if defined(DOUBLE)
/* INIT_F: seed the result from the first element. */
.macro INIT_F
vldmia.f64 X!, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
.endm
/* KERNEL_F1: build the candidate for the next element, compare it with the
   result, and replace the result when VMOVCOND holds. */
.macro KERNEL_F1
vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
/* INIT_S: strided counterpart of INIT_F. */
.macro INIT_S
vldmia.f64 X, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
add X, X, INC_X
.endm
/* KERNEL_S1: strided counterpart of KERNEL_F1. */
.macro KERNEL_S1
vldmia.f64 X, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
/* Single precision versions of the four macros above (s0, s1, s4, s5). */
.macro INIT_F
vldmia.f32 X!, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Entry point.  Scans N elements of X with stride INC_X and keeps the
 * element selected by VMOVCOND (gt by default, lt with USE_MIN) in d0 / s0.
 *   N     (r0) - element count
 *   X     (r1) - pointer to the first element
 *   INC_X (r2) - stride in elements; converted to bytes on the strided path
 * The result is 0 when N <= 0 or INC_X == 0.
 * NOTE(review): only INC_X == 0 is rejected; a negative INC_X takes the
 * strided path - confirm this is what callers expect.
 */
PROLOGUE
.align 5
/* Start from a result of 0: move integer 0 into s0 and, for DOUBLE, convert
   it so that d0 holds 0.0. */
movs r12, #0 // clear floating point register
vmov s0, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
#endif
/* Argument checks, then choose the unit-stride (F) or strided (S) path. */
cmp N, #0
ble amax_kernel_L999
cmp INC_X, #0
beq amax_kernel_L999
cmp INC_X, #1
bne amax_kernel_S_BEGIN
/* Unit-stride path: the first element seeds the result, so N is reduced by
   one before the remaining elements are counted. */
amax_kernel_F_BEGIN:
INIT_F
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_F1
.align 5
/* Main loop, unrolled twice: each half handles four elements and decrements
   I once, so the loop can leave after either half.  The pld lines prefetch
   X_PRE bytes ahead; how many are emitted depends on COMPLEX / DOUBLE. */
amax_kernel_F4:
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
ble amax_kernel_F1
#if defined(COMPLEX) || defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne amax_kernel_F4
/* Unit-stride tail: the N & 3 elements left over by the groups of four. */
amax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
b amax_kernel_L999
/* Strided path: first scale INC_X from elements to bytes
   (4 or 8 bytes per value, doubled for COMPLEX). */
amax_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
/* Same structure as the unit-stride path: seed, groups of four, tail. */
INIT_S
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_S1
.align 5
amax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
amax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
/* Common exit.  When __ARM_PCS_VFP is not defined the result is also copied
   into r0 (single) or r0:r1 (double) before returning. */
amax_kernel_L999:
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0
#endif
#endif
bx lr
EPILOGUE

View File

@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] < minf )
if( x[ix] > minf )
{
min = i;
minf = x[ix];

View File

@ -1,51 +0,0 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of asum.c with the ABS() removed *
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Plain (signed) sum of n elements of x taken every inc_x entries.
 *
 * n      number of elements to add
 * x      input vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns the accumulated sum, or 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	/* Nothing to do for an empty vector or a non-positive stride. */
	if (n <= 0 || inc_x <= 0)
		return(total);

	/* One past the last index that will be visited. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return(total);
}

View File

@ -1,425 +0,0 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if !defined(COMPLEX)
/* Real-valued summation kernels.  d0/d1 (DOUBLE) or s0/s1 are the
   accumulators; d4..d7 / s4..s7 are scratch for loaded values.
   _F variants read X with post-increment (unit stride); _S variants read X
   and then advance it by INC_X, which the caller has scaled to bytes. */
#if defined(DOUBLE)
/* KERNEL_F4: add four consecutive values, alternating between d0 and d1. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
/* KERNEL_F1: add one value to d0. */
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
.endm
/* KERNEL_S4: add four strided values to d0. */
.macro KERNEL_S4
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
/* KERNEL_S1: add one strided value to d0. */
.macro KERNEL_S1
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
#else
/* Single precision versions of the four macros above (s0/s1, s4..s7).
   This KERNEL_F4 has no pld of its own; the main loop issues one for this
   configuration instead. */
.macro KERNEL_F4
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
.endm
.macro KERNEL_S4
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
#endif
#else
/* Complex summation kernels.  Every element is a pair of values and both
   values are added to the running sum.  d0/d1 (DOUBLE) or s0/s1 are the
   accumulators; d4..d7 / s4..s7 are scratch.
   _F variants read X with post-increment (unit stride); _S variants read X
   and then advance it by INC_X, which the caller has scaled to bytes. */
#if defined(DOUBLE)
/* KERNEL_F4: four elements = eight consecutive values, alternating between
   d0 and d1, with a prefetch before each half. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
/* KERNEL_F1: one element = two consecutive values, both added to d0. */
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
.endm
/* KERNEL_S4: four strided elements, both values of each added to d0. */
.macro KERNEL_S4
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
/* KERNEL_S1: one strided element. */
.macro KERNEL_S1
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
#else
/* Single precision versions of the four macros above (s0/s1, s4..s7);
   KERNEL_F4 issues a single prefetch here. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
.endm
.macro KERNEL_S4
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Entry point.  Adds up N elements of X taken with stride INC_X, without
 * taking absolute values, using two accumulators (d0/d1 or s0/s1) that are
 * combined at the exit.  The labels keep the asum_kernel_ prefix.
 *   N     (r0) - element count
 *   X     (r1) - pointer to the first element
 *   INC_X (r2) - stride in elements; converted to bytes on the strided path
 * The result is 0 when N <= 0 or INC_X == 0.
 * NOTE(review): only INC_X == 0 is rejected; a negative INC_X takes the
 * strided path - confirm this is what callers expect.
 */
PROLOGUE
.align 5
/* Zero both accumulators: integer 0 into s0/s1 and, for DOUBLE, convert so
   that d0/d1 hold 0.0. */
movs r12, #0 // clear floating point register
vmov s0, r12
vmov s1, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
#endif
/* Argument checks, then choose the unit-stride (F) or strided (S) path. */
cmp N, #0
ble asum_kernel_L999
cmp INC_X, #0
beq asum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble asum_kernel_F1
.align 5
/* Main loop, unrolled twice: each KERNEL_F4 handles four elements and is
   followed by one decrement of I, so the loop can leave after either half.
   The explicit pld covers the single precision real case, whose KERNEL_F4
   has no prefetch of its own. */
asum_kernel_F4:
#if !defined(DOUBLE) && !defined(COMPLEX)
pld [ X, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble asum_kernel_F1
KERNEL_F4
subs I, I, #1
bne asum_kernel_F4
/* Unit-stride tail: the N & 3 elements left over by the groups of four. */
asum_kernel_F1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
b asum_kernel_L999
/* Strided path: first scale INC_X from elements to bytes
   (4 or 8 bytes per value, doubled for COMPLEX). */
asum_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble asum_kernel_S1
.align 5
asum_kernel_S4:
KERNEL_S4
subs I, I, #1
bne asum_kernel_S4
/* Strided tail: the N & 3 leftover elements. */
asum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
/* Common exit: combine the two accumulators.  When __ARM_PCS_VFP is not
   defined the result is also copied into r0 (single) or r0:r1 (double). */
asum_kernel_L999:
#if defined(DOUBLE)
vadd.f64 d0 , d0, d1 // set return value
#else
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE

View File

@ -1,57 +0,0 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of zasum.c with the ABS() removed *
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * CSUM1(x,i): signed sum of the two values stored at x[i] and x[i+1]
 * (one complex element).  The body and both arguments are parenthesized so
 * the macro expands safely inside any larger expression and with any
 * argument expression.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Plain (signed) sum of both components of n complex elements of x, taken
 * every inc_x elements.
 *
 * n      number of complex elements to add
 * x      input vector, two FLOAT values per element
 * inc_x  distance, in complex elements, between consecutive entries
 *
 * Returns the accumulated sum, or 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG inc_x2;
	FLOAT sumf = 0.0;

	if (n <= 0 || inc_x <= 0) return(sumf);

	/* Stride and end bound expressed in FLOAT units (two per element). */
	inc_x2 = 2 * inc_x;
	n *= inc_x2;

	for (i = 0; i < n; i += inc_x2)
	{
		sumf += CSUM1(x,i);
	}

	return(sumf);
}

View File

@ -1,175 +0,0 @@
# Kernels taken from the portable C implementations in ../arm and ../generic.
# Variable prefixes S/D/C/Z select the precision and real/complex variant.

# Level-1 min/max and index-of-min/max kernels.
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# NOTE(review): the four TRMM variables below are assigned again further down
# in this file (next to the matching GEMM kernels); since the later
# assignments win, these generic values appear to have no effect.
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# TRSM kernels: the same four generic sources (LN, LT, RN, RT) for every
# precision.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Level-1 and level-2 kernels provided as assembly sources (.S) named without
# a directory prefix.  Real precisions (S/D) usually share one source and
# complex precisions (C/Z) another; copy.S and swap.S serve all four.
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
# Note: CASUM and ZASUM use different sources (casum.S vs zasum.S).
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
# Level-2 GEMV kernels (N = non-transposed, T = transposed).
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
# Single precision GEMM/TRMM: the assembly kernel file name is derived from
# the unroll factors SGEMM_UNROLL_M x SGEMM_UNROLL_N.
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
# Separate "inner" copy routines (sized by UNROLL_M) are only configured when
# the two unroll factors differ.
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# "Outer" copy routines (sized by UNROLL_N) are always configured.
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision GEMM/TRMM: the assembly kernel file name is derived from
# the unroll factors DGEMM_UNROLL_M x DGEMM_UNROLL_N.
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
# Inner copy routines, only when the unroll factors differ: assembly versions
# are used for UNROLL_M == 8, the generic C versions otherwise.
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer copy routines: assembly versions for UNROLL_N == 4, generic C
# versions otherwise.
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single precision complex GEMM/TRMM: the assembly kernel file name is
# derived from the unroll factors CGEMM_UNROLL_M x CGEMM_UNROLL_N.  The copy
# routines come from the generic zgemm_* C sources.
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
# Inner copy routines are only configured when the unroll factors differ.
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer copy routines are always configured.
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision complex GEMM/TRMM: the assembly kernel file name is
# derived from the unroll factors ZGEMM_UNROLL_M x ZGEMM_UNROLL_N.
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
# Inner copy routines are only configured when the unroll factors differ.
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer copy routines are always configured.
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -1,164 +0,0 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
/******************************************************************************/
/* KERNEL_F1: one complex element at unit stride.  Loads two floats into v1
   (X advances by 8 bytes), uses ext to bring the second float into s2, adds
   both into TMPF and then TMPF into SUMF. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
/* KERNEL_F8: eight complex elements (sixteen floats, 64 bytes) at unit
   stride, accumulated lane-wise into v0.4s, with a prefetch 1024 bytes
   ahead.  The lanes are reduced later by KERNEL_F8_FINALIZE.
   NOTE(review): this relies on all four lanes of v0 being zero on entry;
   presumably the scalar write to SUMF in the prologue clears the upper
   lanes - confirm against the architecture manual. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
PRFM PLDL1KEEP, [X, #1024]
fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm
/* KERNEL_F8_FINALIZE: reduce the four lanes of v0 to the scalar SUMF
   (upper half added onto lower half, then a pairwise add). */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm
/* INIT_S: convert INC_X from elements to bytes (shift by 3 = 8 bytes per
   two-float element). */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm
/* KERNEL_S1: same as KERNEL_F1 but X advances by INC_X bytes. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
/*
 * Entry point.  Adds up both floats of N two-float elements of X taken with
 * stride INC_X, without taking absolute values; the total is left in SUMF.
 *   N     (x0) - element count
 *   X     (x1) - pointer to the first element
 *   INC_X (x2) - stride in elements; converted to bytes by INIT_S
 * SUMF stays 0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
/* Clear the accumulator and the temporary. */
fmov SUMF, REG0
fmov s1, SUMF
/* Argument checks, then choose the unit-stride (F) or strided (S) path. */
cmp N, xzr
ble .Lcsum_kernel_L999
cmp INC_X, xzr
ble .Lcsum_kernel_L999
cmp INC_X, #1
bne .Lcsum_kernel_S_BEGIN
/* Unit stride: I = N >> 3 blocks of eight elements, vector-accumulated and
   then reduced once to a scalar. */
.Lcsum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq .Lcsum_kernel_F1
.Lcsum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne .Lcsum_kernel_F8
KERNEL_F8_FINALIZE
/* Unit-stride tail: the N & 7 leftover elements, one per pass. */
.Lcsum_kernel_F1:
ands I, N, #7
ble .Lcsum_kernel_L999
.Lcsum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Lcsum_kernel_F10
.Lcsum_kernel_L999:
ret
/* Strided path: scale the stride, then I = N >> 2 groups of four elements
   followed by the N & 3 leftovers. */
.Lcsum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble .Lcsum_kernel_S1
.Lcsum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Lcsum_kernel_S4
.Lcsum_kernel_S1:
ands I, N, #3
ble .Lcsum_kernel_L999
.Lcsum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Lcsum_kernel_S10
ret
EPILOGUE

View File

@ -1,186 +0,0 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
/* KERNEL_F1: one element at unit stride.  Loads it into TMPF (X advances by
   SZ bytes) and adds it to SUMF. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fadd SUMF, SUMF, TMPF
.endm
/* KERNEL_F8: eight elements at unit stride, accumulated lane-wise into v0
   (four float lanes or two double lanes), with a prefetch 1024 bytes ahead.
   The lanes are reduced later by KERNEL_F8_FINALIZE.
   NOTE(review): this relies on every lane of v0 being zero on entry;
   presumably the scalar write to SUMF in the prologue clears the upper
   lanes - confirm against the architecture manual. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // load 8 floats: v1 = X0..X3, v2 = X4..X7
fadd v1.4s, v1.4s, v2.4s // lane-wise: v1[k] = X[k] + X[k+4]
fadd v0.4s, v0.4s, v1.4s // add the four partial sums into v0
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
PRFM PLDL1KEEP, [X, #1024]
fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm
/* KERNEL_F8_FINALIZE: reduce the lanes of v0 to the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm
/* INIT_S: convert INC_X from elements to bytes (4 bytes per float, 8 per
   double). */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm
/* KERNEL_S1: one strided element.  Loads it into lane 0 of v1 (the register
   behind TMPF), advances X by INC_X bytes, and adds TMPF to SUMF. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
/*
 * Entry point.  Adds up N elements of X taken with stride INC_X, without
 * taking absolute values; the total is left in SUMF (s0 or d0).
 *   N     (x0) - element count
 *   X     (x1) - pointer to the first element
 *   INC_X (x2) - stride in elements; converted to bytes by INIT_S
 * SUMF stays 0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
/* Clear the accumulator and the temporary (s1 / d1 is TMPF). */
fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif
/* Argument checks, then choose the unit-stride (F) or strided (S) path. */
cmp N, xzr
ble .Lsum_kernel_L999
cmp INC_X, xzr
ble .Lsum_kernel_L999
cmp INC_X, #1
bne .Lsum_kernel_S_BEGIN
/* Unit stride: I = N >> 3 blocks of eight elements, vector-accumulated and
   then reduced once to a scalar. */
.Lsum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq .Lsum_kernel_F1
.Lsum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne .Lsum_kernel_F8
KERNEL_F8_FINALIZE
/* Unit-stride tail: the N & 7 leftover elements, one per pass. */
.Lsum_kernel_F1:
ands I, N, #7
ble .Lsum_kernel_L999
.Lsum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Lsum_kernel_F10
.Lsum_kernel_L999:
ret
/* Strided path: scale the stride, then I = N >> 2 groups of four elements
   followed by the N & 3 leftovers. */
.Lsum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble .Lsum_kernel_S1
.Lsum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Lsum_kernel_S4
.Lsum_kernel_S1:
ands I, N, #3
ble .Lsum_kernel_L999
.Lsum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Lsum_kernel_S10
ret
EPILOGUE

View File

@ -1,158 +0,0 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register aliases for the three arguments and the loop counter. */
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
/*
 * REG0 is the zero register moved into SUMF to clear it; SUMF (d0) is the
 * scalar accumulator and TMPF (d1) the scalar scratch value.
 * TMPVF and SZ are not referenced by the macros or the routine below.
 */
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
/******************************************************************************/
/*
 * Unit-stride path, one element: load the two doubles at X (16 bytes,
 * presumably the real and imaginary parts), advance X past them, add the two
 * together into TMPF, and add TMPF into SUMF.
 */
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
/*
 * Unit-stride path, four elements: load eight consecutive doubles (four
 * 16-byte elements), advance X past them, prefetch 1024 bytes ahead of the
 * new X, and add the loaded values lane-wise into the vector accumulator v0.
 * KERNEL_F4_FINALIZE folds v0 into SUMF once the loop is done.
 */
.macro KERNEL_F4
	ld1	{v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64	// X += 64
	PRFM	PLDL1KEEP, [X, #1024]
	fadd	v3.2d, v3.2d, v4.2d	// pairwise partial sums; the two
	fadd	v1.2d, v1.2d, v2.2d	// adds are independent of each other
	fadd	v0.2d, v0.2d, v1.2d	// accumulate v1 first, then v3
	fadd	v0.2d, v0.2d, v3.2d
.endm
/*
 * Fold the two lanes of the vector accumulator v0 (filled by KERNEL_F4) into
 * the scalar accumulator SUMF.
 */
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm
/*
 * Strided path set-up: multiply INC_X by 16 (the number of bytes KERNEL_S1
 * loads per element) so it can be used as a post-index register offset.
 */
.macro INIT_S
lsl INC_X, INC_X, #4
.endm
/*
 * Strided path, one element: load the two doubles at X, advance X by INC_X
 * (already scaled by INIT_S), add the two together into TMPF, and add TMPF
 * into SUMF.
 */
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
/*
 * Sum of all doubles in the N two-double elements of X taken with stride
 * INC_X; the result is left in SUMF.  When N <= 0 or INC_X <= 0 nothing is
 * loaded and SUMF keeps the zero written at entry.
 */
PROLOGUE

	fmov	SUMF, REG0			// clear the accumulator

	/* Argument checks: bail out with SUMF == 0. */
	cmp	N, xzr
	ble	.Lzsum_kernel_done
	cmp	INC_X, xzr
	ble	.Lzsum_kernel_done

	/* Any stride other than 1 takes the strided path. */
	cmp	INC_X, #1
	bne	.Lzsum_kernel_strided

	/* ---- unit stride: blocks of four, then up to three leftovers ---- */
	asr	I, N, #2			// I = number of 4-element blocks
	cbz	I, .Lzsum_kernel_unit_tail

.Lzsum_kernel_unit_x4:
	KERNEL_F4
	subs	I, I, #1
	bne	.Lzsum_kernel_unit_x4

	KERNEL_F4_FINALIZE			// v0 lanes -> SUMF

.Lzsum_kernel_unit_tail:
	ands	I, N, #3			// I = N mod 4
	beq	.Lzsum_kernel_done

.Lzsum_kernel_unit_x1:
	KERNEL_F1
	subs	I, I, #1
	bne	.Lzsum_kernel_unit_x1

.Lzsum_kernel_done:
	ret

	/* ---- general stride: groups of four, then up to three leftovers ---- */
.Lzsum_kernel_strided:
	INIT_S					// INC_X: elements -> bytes
	asr	I, N, #2			// I = number of 4-element groups
	cbz	I, .Lzsum_kernel_strided_tail	// N > 0 here, so I is never negative

.Lzsum_kernel_strided_x4:
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	.Lzsum_kernel_strided_x4

.Lzsum_kernel_strided_tail:
	ands	I, N, #3			// I = N mod 4
	beq	.Lzsum_kernel_done

.Lzsum_kernel_strided_x1:
	KERNEL_S1
	subs	I, I, #1
	bne	.Lzsum_kernel_strided_x1

	ret

EPILOGUE

View File

@ -60,10 +60,6 @@ CASUMKERNEL = asum.S
ZASUMKERNEL = asum.S
XASUMKERNEL = asum.S
CSUMKERNEL = sum.S
ZSUMKERNEL = sum.S
XSUMKERNEL = sum.S
CNRM2KERNEL = nrm2.S
ZNRM2KERNEL = nrm2.S
XNRM2KERNEL = nrm2.S

View File

@ -1,358 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2019, The OpenBLAS project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * PREFETCH_SIZE: distance, in units of SIZE, between X and the initial
 * prefetch pointer PRE1 (PRE1 = X + PREFETCH_SIZE * SIZE at entry).
 * One value per precision: XDOUBLE, DOUBLE, or the default.
 */
#ifdef XDOUBLE
#define PREFETCH_SIZE ( 8 * 16 + 4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 + 8)
#else
#define PREFETCH_SIZE (32 * 16 + 16)
#endif
/*
 * COMPADD is 1 for COMPLEX builds and 0 otherwise; it adjusts the shift
 * counts and bit positions used to split N into loop trips and leftovers.
 * STRIDE is the post-increment of the first load of every load pair:
 * INCX for real data, SIZE for complex data (the second load of the pair
 * always post-increments by INCX).
 */
#ifndef COMPLEX
#define COMPADD 0
#define STRIDE INCX
#else
#define COMPADD 1
#define STRIDE SIZE
#endif
/* Register aliases. */
#define PRE1 r2    /* prefetch pointer */
#define I r17      /* main-loop trip count (minus one once loaded into ar.lc) */
#define J r18      /* leftover count: low bits of N */
#define INCX16 r21 /* post-increment applied to PRE1 by each lfetch */
#define PR r30     /* saved predicate registers */
#define ARLC r31   /* saved ar.lc */
#define N r32      /* argument: element count */
#define X r33      /* argument: vector address */
#define INCX r34   /* argument: stride */
/*
 * Plain sum of the values of X (no absolute value is taken).  With COMPLEX
 * defined every element contributes two values.  The sum is accumulated in
 * four partial sums f8..f11 and left in f8 on return.  When N <= 0 or
 * INCX <= 0 the routine returns early with f8 = f0.
 */
PROLOGUE
.prologue
PROFCODE
/* PRE1 = X + PREFETCH_SIZE * SIZE; f8 = f0; save ar.lc so it can be restored. */
{ .mfi
adds PRE1 = PREFETCH_SIZE * SIZE, X
mov f8 = f0
.save ar.lc, ARLC
mov ARLC = ar.lc
}
;;
.body
/*
 * With F_INTERFACE, N and INCX hold addresses: load the values through them
 * (LDINT is defined outside this file) and, unless USE64BITINT is defined,
 * sign-extend them from 4 bytes.
 */
#ifdef F_INTERFACE
{ .mmi
LDINT N = [N]
LDINT INCX = [INCX]
nop.i 0
}
;;
#ifndef USE64BITINT
{ .mii
nop.m 0
sxt4 N = N
sxt4 INCX = INCX
}
;;
#endif
#endif
/*
 * p6 = !(0 < INCX), p7 = !(0 < N): either one triggers the early return below.
 * I = N >> (4 - COMPADD) is the number of full main-loop trips (16 loads each);
 * J = the low (4 - COMPADD) bits of N, i.e. the leftover element count.
 */
{ .mmi
cmp.lt p0, p6 = r0, INCX
cmp.lt p0, p7 = r0, N
shr I = N, (4 - COMPADD)
}
{ .mbb
and J = ((1 << (4 - COMPADD)) - 1), N
(p6) br.ret.sptk.many b0
(p7) br.ret.sptk.many b0
}
;;
/* I becomes trips - 1 (the value loaded into ar.lc); clear f10; save predicates. */
{ .mfi
adds I = -1, I
mov f10 = f0
mov PR = pr
}
/* p9 = (J == 0): no leftovers.  p12 = bit (3 - COMPADD) of N is set. */
{ .mfi
cmp.eq p9, p0 = r0, J
mov f9 = f0
tbit.z p0, p12 = N, 3 - COMPADD
}
;;
/* Stage predicates for the br.ctop loop: p16 true, p17..p19 false; ar.ec = 3. */
{ .mmi
cmp.eq p16, p0 = r0, r0
cmp.ne p17, p0 = r0, r0
mov ar.ec= 3
}
/*
 * INCX <<= BASE_SHIFT + COMPADD -- presumably converts the stride from
 * elements to bytes; BASE_SHIFT is defined outside this file (TODO confirm).
 */
{ .mfi
cmp.ne p18, p0 = r0, r0
mov f11 = f0
shl INCX = INCX, BASE_SHIFT + COMPADD
}
;;
/* INCX16 = INCX << k: the step applied to PRE1 by every lfetch in the loop. */
{ .mmi
#ifdef XDOUBLE
shladd INCX16 = INCX, (3 - COMPADD), r0
#else
shladd INCX16 = INCX, (4 - COMPADD), r0
#endif
cmp.ne p19, p0 = r0, r0
mov ar.lc = I
}
/*
 * p8 = (I < 0): fewer values than one full trip, skip the main loop.
 * COMPLEX only: the first load of each pair already advances X by SIZE
 * (STRIDE), so INCX is reduced by SIZE for the second load of the pair.
 */
{ .mmb
cmp.gt p8 ,p0 = r0, I
#ifdef COMPLEX
adds INCX = - SIZE, INCX
#else
nop.m 0
#endif
(p8) br.cond.dpnt .L55
}
;;
.align 32
/*
 * Main loop: every trip issues 16 loads under p16 (post-increments alternate
 * STRIDE / INCX) and 16 adds under p18/p19, spread round-robin over the four
 * partial sums f8..f11.
 * NOTE(review): f32 and up appear to be used as rotating registers (a value
 * loaded into f32 under p16 is consumed as f34 under p18, f68 as f71 under
 * p19) -- confirm against the register-rotation set-up done outside this file.
 */
.L52:
{ .mmf
(p16) lfetch.nt1 [PRE1], INCX16
(p16) LDFD f32 = [X], STRIDE
}
{ .mfb
(p19) FADD f8 = f8, f71
}
;;
{ .mmf
(p16) LDFD f35 = [X], INCX
}
{ .mfb
(p19) FADD f9 = f9, f74
}
;;
{ .mmf
(p16) LDFD f38 = [X], STRIDE
}
{ .mfb
(p19) FADD f10 = f10, f77
}
;;
{ .mmf
(p16) LDFD f41 = [X], INCX
}
{ .mfb
(p19) FADD f11 = f11, f80
}
;;
{ .mmf
(p16) LDFD f44 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f34
}
;;
{ .mmf
(p16) LDFD f47 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f37
}
;;
{ .mmf
(p16) LDFD f50 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f40
}
;;
{ .mmf
(p16) LDFD f53 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f43
}
;;
/* XDOUBLE issues a second prefetch half-way through the trip. */
{ .mmf
#ifdef XDOUBLE
(p16) lfetch.nt1 [PRE1], INCX16
#endif
(p16) LDFD f56 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f46
}
;;
{ .mmf
(p16) LDFD f59 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f49
}
;;
{ .mmf
(p16) LDFD f62 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f52
}
;;
{ .mmf
(p16) LDFD f65 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f55
}
;;
{ .mmf
(p16) LDFD f68 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f58
}
;;
{ .mmf
(p16) LDFD f71 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f61
}
;;
{ .mmf
(p16) LDFD f74 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f64
}
;;
{ .mmf
(p16) LDFD f77 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f67
br.ctop.sptk.few .L52
}
;;
/*
 * Four unpredicated adds after the loop.
 * NOTE(review): presumably these consume the last four values that the (p19)
 * adds at the top of the loop body would have handled on a further trip --
 * confirm.
 */
FADD f8 = f8, f71
FADD f9 = f9, f74
FADD f10 = f10, f77
FADD f11 = f11, f80
.align 32
;;
/*
 * Leftovers (fewer values than one full trip).  p9 (J == 0) skips this part.
 * The low bits of N select predicated groups of loads:
 *   p12 -> 8 values (f32..f39), p13 -> 4 values (f40..f43),
 *   p14 -> 2 values (f44, f45), p15 -> 1 value (f46, real data only).
 * The loads are issued first; the matching adds follow below.
 */
.L55:
(p12) LDFD f32 = [X], STRIDE
(p9) br.cond.dptk .L998
;;
(p12) LDFD f33 = [X], INCX
;;
(p12) LDFD f34 = [X], STRIDE
;;
(p12) LDFD f35 = [X], INCX
tbit.z p0, p13 = N, (2 - COMPADD)
;;
(p12) LDFD f36 = [X], STRIDE
tbit.z p0, p14 = N, (1 - COMPADD)
;;
(p12) LDFD f37 = [X], INCX
#ifndef COMPLEX
tbit.z p0, p15 = N, 0
#endif
;;
(p12) LDFD f38 = [X], STRIDE
;;
(p12) LDFD f39 = [X], INCX
;;
(p13) LDFD f40 = [X], STRIDE
;;
(p13) LDFD f41 = [X], INCX
;;
(p13) LDFD f42 = [X], STRIDE
(p12) FADD f8 = f8, f32
;;
(p13) LDFD f43 = [X], INCX
(p12) FADD f9 = f9, f33
;;
(p14) LDFD f44 = [X], STRIDE
(p12) FADD f10 = f10, f34
;;
(p14) LDFD f45 = [X], INCX
(p12) FADD f11 = f11, f35
;;
#ifndef COMPLEX
(p15) LDFD f46 = [X]
#endif
(p12) FADD f8 = f8, f36
;;
(p12) FADD f9 = f9, f37
(p12) FADD f10 = f10, f38
(p12) FADD f11 = f11, f39
;;
(p13) FADD f8 = f8, f40
(p13) FADD f9 = f9, f41
#ifndef COMPLEX
#endif
(p13) FADD f10 = f10, f42
;;
(p13) FADD f11 = f11, f43
(p14) FADD f8 = f8, f44
(p14) FADD f9 = f9, f45
#ifndef COMPLEX
(p15) FADD f10 = f10, f46
#endif
;;
.align 32
/*
 * Final reduction: f8 += f9 and f10 += f11, then f8 += f10.  ar.lc and the
 * predicate registers saved at entry are restored before returning.
 */
.L998:
{ .mfi
FADD f8 = f8, f9
mov ar.lc = ARLC
}
{ .mmf
FADD f10 = f10, f11
}
;;
{ .mii
mov pr = PR, -65474
}
;;
{ .mfb
FADD f8 = f8, f10
br.ret.sptk.many b0
}
EPILOGUE

View File

@ -30,11 +30,6 @@ IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c
ifdef HAVE_MSA
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c

View File

@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] < minf )
if( x[ix] > minf )
{
min = i;
minf = x[ix];

Some files were not shown because too many files have changed in this diff Show More