Merge branch 'release-0.2.16'
This commit is contained in:
commit
fced5744fb
|
@ -68,3 +68,4 @@ test/zblat2
|
|||
test/zblat3
|
||||
build
|
||||
build.*
|
||||
*.swp
|
||||
|
|
|
@ -24,7 +24,12 @@ before_install:
|
|||
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
|
||||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
||||
|
||||
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
script:
|
||||
- set -e
|
||||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
|||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 14)
|
||||
set(OpenBLAS_PATCH_VERSION 16)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
|
@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH)
|
|||
list(APPEND BLASDIRS kernel)
|
||||
endif ()
|
||||
|
||||
if (DEFINED UTEST_CHECK)
|
||||
set(SANITY_CHECK 1)
|
||||
endif ()
|
||||
|
||||
if (DEFINED SANITY_CHECK)
|
||||
list(APPEND BLASDIRS reference)
|
||||
endif ()
|
||||
|
@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED})
|
|||
message(FATAL_ERROR "Neither static nor shared are enabled.")
|
||||
endif ()
|
||||
|
||||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
|
@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
|
|||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(utest)
|
||||
|
||||
if(NOT MSVC)
|
||||
#only build shared library for MSVC
|
||||
|
@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
|
|||
endif()
|
||||
|
||||
#build test and ctest
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
|
|
|
@ -121,6 +121,17 @@ In chronological order:
|
|||
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
|
||||
ARMv8 support.
|
||||
|
||||
* Jerome Robert <jeromerobert@gmx.com>
|
||||
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
||||
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
||||
* [2015-12-28] Allow forcing the number of parallel make jobs
|
||||
* [2015-12-28] Fix detection of AMD E2-3200
|
||||
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
|
||||
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
||||
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
|
||||
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
|
||||
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
|
||||
|
||||
* Dan Kortschak
|
||||
* [2015-01-07] Added test for drotmg bug #484.
|
||||
|
||||
|
@ -130,5 +141,11 @@ In chronological order:
|
|||
* Martin Koehler <https://github.com/grisuthedragon/>
|
||||
* [2015-09-07] Improved imatcopy
|
||||
|
||||
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
|
||||
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
|
||||
* [2015-11-20] lapack-test fixes for Cortex-A57
|
||||
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
|
||||
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
|
|
|
@ -1,4 +1,57 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.16
|
||||
15-Mar-2016
|
||||
common:
|
||||
* Avoid potential getenv segfault. (#716)
|
||||
* Import LAPACK svn bugfix #142-#147,#150-#155
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
|
||||
* Fix bug with scipy linalg test.
|
||||
|
||||
ARM:
|
||||
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize D and Z BLAS3 functions for Power8.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16.rc1
|
||||
23-Feb-2016
|
||||
common:
|
||||
* Upgrade LAPACK to 3.6.0 version.
|
||||
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
|
||||
LAPACK deprecated functions.
|
||||
* Add MAKE_NB_JOBS option in Makefile.
|
||||
Force number of make jobs. This is particularly
|
||||
useful when using distcc. (#735. Thanks, Jerome Robert.)
|
||||
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
|
||||
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
|
||||
* Improve small zger, zgemv, ztrmv using stack allocation (#727. Thanks, Jerome Robert)
|
||||
* Let openblas_get_num_threads return the number of active threads.
|
||||
(#760. Thanks, Jerome Robert)
|
||||
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
|
||||
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
|
||||
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
|
||||
* Detect Intel Avoton.
|
||||
* Detect AMD Trinity, Richland, E2-3200.
|
||||
* Fix gemv performance bug on Mac OSX Intel Haswell.
|
||||
* Fix some bugs with CMake and Visual Studio
|
||||
|
||||
ARM:
|
||||
* Support and optimize Cortex-A57 AArch64.
|
||||
(#686. Thanks, Ashwin Sekhar TK)
|
||||
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
|
||||
* Update ARMV6 kernels.
|
||||
|
||||
POWER:
|
||||
* Fix detection of POWER architecture
|
||||
(#684. Thanks, Sebastien Villemot)
|
||||
|
||||
====================================================================
|
||||
Version 0.2.15
|
||||
27-Oct-2015
|
||||
|
|
37
Makefile
37
Makefile
|
@ -7,10 +7,6 @@ ifneq ($(DYNAMIC_ARCH), 1)
|
|||
BLASDIRS += kernel
|
||||
endif
|
||||
|
||||
ifdef UTEST_CHECK
|
||||
SANITY_CHECK = 1
|
||||
endif
|
||||
|
||||
ifdef SANITY_CHECK
|
||||
BLASDIRS += reference
|
||||
endif
|
||||
|
@ -85,22 +81,22 @@ endif
|
|||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
|
@ -117,10 +113,8 @@ ifndef CROSS
|
|||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
ifdef UTEST_CHECK
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
endif
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
endif
|
||||
|
@ -249,16 +243,23 @@ ifndef NOFORTRAN
|
|||
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
ifeq ($(FC), gfortran)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
-@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
ifdef SMP
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
else
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
else
|
||||
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
|
||||
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
@ -288,7 +289,17 @@ endif
|
|||
lapack-test :
|
||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
endif
|
||||
|
||||
lapack-runtest:
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
|
||||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
||||
|
|
|
@ -11,8 +11,8 @@ endif
|
|||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
else
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
|
@ -29,5 +29,3 @@ ifeq ($(CORE), ARMV5)
|
|||
CCOMMON_OPT += -marm -march=armv5
|
||||
FCOMMON_OPT += -marm -march=armv5
|
||||
endif
|
||||
|
||||
|
||||
|
|
|
@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
|
|||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
endif
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ install : lib.grd
|
|||
#for inc
|
||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
|
@ -48,10 +48,10 @@ endif
|
|||
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
|
@ -64,7 +64,7 @@ endif
|
|||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.15
|
||||
VERSION = 0.2.16
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -79,6 +79,9 @@ VERSION = 0.2.15
|
|||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
# BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
|
@ -108,6 +111,10 @@ NO_AFFINITY = 1
|
|||
# Don't use parallel make.
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
# Force number of make jobs. The default is the number of logical CPU of the host.
|
||||
# This is particularly useful when using distcc
|
||||
# MAKE_NB_JOBS = 2
|
||||
|
||||
# If you would like to know minute performance report of GotoBLAS.
|
||||
# FUNCTION_PROFILE = 1
|
||||
|
||||
|
@ -138,10 +145,6 @@ NO_AFFINITY = 1
|
|||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
|
||||
# SANITY_CHECK to compare the result with reference BLAS.
|
||||
# UTEST_CHECK = 1
|
||||
|
||||
# The installation directory.
|
||||
# PREFIX = /opt/OpenBLAS
|
||||
|
||||
|
@ -159,10 +162,11 @@ COMMON_PROF = -pg
|
|||
# Build Debug version
|
||||
# DEBUG = 1
|
||||
|
||||
# Improve GEMV and GER for small matrices by stack allocation.
|
||||
# For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||
# Set maximum stack allocation.
|
||||
# The default value is 2048. 0 disables stack allocation and may reduce GER and GEMV
|
||||
# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||
#
|
||||
MAX_STACK_ALLOC=2048
|
||||
# MAX_STACK_ALLOC = 0
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoid conflicts with other BLAS libraries, especially when using
|
||||
|
|
|
@ -139,6 +139,10 @@ NO_PARALLEL_MAKE=0
|
|||
endif
|
||||
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
|
||||
|
||||
ifdef MAKE_NB_JOBS
|
||||
GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
|
||||
endif
|
||||
|
||||
ifeq ($(HOSTCC), loongcc)
|
||||
GETARCH_FLAGS += -static
|
||||
endif
|
||||
|
@ -292,12 +296,14 @@ endif
|
|||
ifneq ($(OSNAME), WINNT)
|
||||
ifneq ($(OSNAME), CYGWIN_NT)
|
||||
ifneq ($(OSNAME), Interix)
|
||||
ifneq ($(OSNAME), Android)
|
||||
ifdef SMP
|
||||
EXTRALIB += -lpthread
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
|
||||
|
@ -324,7 +330,8 @@ ifdef SANITY_CHECK
|
|||
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
||||
endif
|
||||
|
||||
ifdef MAX_STACK_ALLOC
|
||||
MAX_STACK_ALLOC ?= 2048
|
||||
ifneq ($(MAX_STACK_ALLOC), 0)
|
||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||
endif
|
||||
|
||||
|
@ -374,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double
|
|||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
EXPRECISION = 1
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
FCOMMON_OPT += -m128bit-long-double
|
||||
endif
|
||||
endif
|
||||
|
@ -388,7 +395,7 @@ endif
|
|||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
|
||||
#check
|
||||
#check
|
||||
ifeq ($(USE_THREAD), 0)
|
||||
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
|
||||
endif
|
||||
|
@ -952,17 +959,18 @@ ifeq ($(OSNAME), SunOS)
|
|||
TAR = gtar
|
||||
PATCH = gpatch
|
||||
GREP = ggrep
|
||||
AWK = nawk
|
||||
else
|
||||
TAR = tar
|
||||
PATCH = patch
|
||||
GREP = grep
|
||||
AWK = awk
|
||||
endif
|
||||
|
||||
ifndef MD5SUM
|
||||
MD5SUM = md5sum
|
||||
endif
|
||||
|
||||
AWK = awk
|
||||
|
||||
REVISION = -r$(VERSION)
|
||||
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
|
||||
|
@ -971,16 +979,25 @@ ifeq ($(DEBUG), 1)
|
|||
COMMON_OPT += -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
FCOMMON_OPT += -g
|
||||
endif
|
||||
|
||||
ifndef COMMON_OPT
|
||||
COMMON_OPT = -O2
|
||||
endif
|
||||
|
||||
ifndef FCOMMON_OPT
|
||||
FCOMMON_OPT = -O2 -frecursive
|
||||
endif
|
||||
|
||||
|
||||
|
||||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
||||
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
override FFLAGS += $(FCOMMON_OPT)
|
||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
#MAKEOVERRIDES =
|
||||
|
||||
#For LAPACK Fortran codes.
|
||||
|
@ -1170,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1
|
|||
else
|
||||
SUNPATH = /opt/SUNWspro
|
||||
endif
|
||||
|
||||
|
|
|
@ -75,10 +75,11 @@ Please read GotoBLAS_01Readme.txt
|
|||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
### Support OS:
|
||||
- **GNU/Linux**
|
||||
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
|
||||
|
|
|
@ -74,3 +74,5 @@ ARMV5
|
|||
|
||||
7.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA57
|
||||
|
||||
|
|
|
@ -0,0 +1,199 @@
|
|||
# Notes on OpenBLAS usage
|
||||
## Usage
|
||||
|
||||
#### Program is Terminated. Because you tried to allocate too many memory regions
|
||||
|
||||
In OpenBLAS, we manage a pool of memory buffers and allocate the number of
|
||||
buffers as the following.
|
||||
```
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
```
|
||||
This error indicates that the program exceeded the number of buffers.
|
||||
|
||||
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
|
||||
following ways:
|
||||
|
||||
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
|
||||
* Call `openblas_set_num_threads(1)` in the application on runtime.
|
||||
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
|
||||
|
||||
If the application is parallelized by OpenMP, please use OpenBLAS built with
|
||||
`USE_OPENMP=1`
|
||||
|
||||
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
|
||||
|
||||
The environment variable which controls the kernel selection is
|
||||
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
|
||||
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
|
||||
returns the used target.
|
||||
|
||||
#### How could I disable OpenBLAS threading affinity on runtime?
|
||||
|
||||
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
|
||||
variable to disable threading affinity on runtime. For example, before the
|
||||
running,
|
||||
```
|
||||
export OPENBLAS_MAIN_FREE=1
|
||||
```
|
||||
|
||||
Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1`
|
||||
in `Makefile.rule`.
|
||||
|
||||
## Linking with the library
|
||||
|
||||
* Link with shared library
|
||||
|
||||
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
|
||||
|
||||
If the library is multithreaded, please add `-lpthread`. If the library
|
||||
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
|
||||
|
||||
* Link with static library
|
||||
|
||||
`gcc -o test test.c /your/path/libopenblas.a`
|
||||
|
||||
You can download `test.c` from https://gist.github.com/xianyi/5780018
|
||||
|
||||
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
|
||||
default), custom programs statically linked against `libopenblas.a` should also
|
||||
link with the pthread library e.g.:
|
||||
|
||||
```
|
||||
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
|
||||
```
|
||||
|
||||
Failing to add the `-lpthread` flag will cause errors such as:
|
||||
|
||||
```
|
||||
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
|
||||
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
|
||||
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
|
||||
...
|
||||
```
|
||||
|
||||
## Code examples
|
||||
|
||||
#### Call CBLAS interface
|
||||
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
|
||||
```
|
||||
#include <cblas.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void main()
|
||||
{
|
||||
int i=0;
|
||||
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
|
||||
|
||||
for(i=0; i<9; i++)
|
||||
printf("%lf ", C[i]);
|
||||
printf("\n");
|
||||
}
|
||||
```
|
||||
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
|
||||
|
||||
#### Call BLAS Fortran interface
|
||||
|
||||
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
|
||||
|
||||
```
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "sys/time.h"
|
||||
#include "time.h"
|
||||
|
||||
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int i;
|
||||
printf("test!\n");
|
||||
if(argc<4){
|
||||
printf("Input Error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
int sizeofa = m * k;
|
||||
int sizeofb = k * n;
|
||||
int sizeofc = m * n;
|
||||
char ta = 'N';
|
||||
char tb = 'N';
|
||||
double alpha = 1.2;
|
||||
double beta = 0.001;
|
||||
|
||||
struct timeval start,finish;
|
||||
double duration;
|
||||
|
||||
double* A = (double*)malloc(sizeof(double) * sizeofa);
|
||||
double* B = (double*)malloc(sizeof(double) * sizeofb);
|
||||
double* C = (double*)malloc(sizeof(double) * sizeofc);
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
for (i=0; i<sizeofa; i++)
|
||||
A[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofb; i++)
|
||||
B[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofc; i++)
|
||||
C[i] = i%3+1;//(rand()%100)/10.0;
|
||||
//#if 0
|
||||
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
|
||||
gettimeofday(&start, NULL);
|
||||
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
|
||||
gettimeofday(&finish, NULL);
|
||||
|
||||
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
|
||||
double gflops = 2.0 * m *n*k;
|
||||
gflops = gflops/duration*1.0e-6;
|
||||
|
||||
FILE *fp;
|
||||
fp = fopen("timeDGEMM.txt", "a");
|
||||
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
|
||||
fclose(fp);
|
||||
|
||||
free(A);
|
||||
free(B);
|
||||
free(C);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
|
||||
|
||||
` ./time_dgemm <m> <n> <k> `
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail because of a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase from the shell.
|
||||
|
||||
## BLAS reference manual
|
||||
If you want to understand every BLAS function and definition, please read
|
||||
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
|
||||
or [netlib.org](http://netlib.org/blas/)
|
||||
|
||||
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
|
||||
|
||||
## How to reference OpenBLAS.
|
||||
|
||||
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
|
||||
|
||||
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
|
||||
|
|
@ -39,4 +39,6 @@ before_build:
|
|||
- cmake -G "Visual Studio 12 Win64" .
|
||||
|
||||
test_script:
|
||||
- echo Build OK!
|
||||
- echo Running Test
|
||||
- cd c:\projects\OpenBLAS\utest
|
||||
- openblas_utest
|
||||
|
|
|
@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
|||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
|
|
|
@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
|
|||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(j = 0; j < to; j++){
|
||||
for(i = 0; i < to * COMPSIZE; i++){
|
||||
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import numpy
|
||||
from numpy import zeros
|
||||
from numpy.random import randn
|
||||
from scipy.linalg import blas
|
||||
|
||||
|
||||
def run_dsyrk(N, l):
    """Time ``l`` calls of double-precision SYRK on an N x N matrix.

    Builds a random Fortran-ordered ``float64`` matrix ``A`` and a zeroed
    result matrix ``C``, calls ``blas.dsyrk`` ``l`` times, and prints one line
    with the matrix size, an MFlops estimate (counted as N**3 operations per
    call) and the elapsed wall-clock time in seconds.

    Returns None; the result is only printed.
    """
    A = randn(N, N).astype('float64', order='F')
    C = zeros((N, N), dtype='float64', order='F')

    start = time.time()
    for i in range(0, l):
        blas.dsyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()

    timediff = (end - start)
    # A coarse clock can report zero elapsed time for very small problems;
    # report an infinite rate instead of raising ZeroDivisionError.
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        mflops = float('inf')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default sweep: sizes 128, 256, ..., 2048 with one call per size.
    N, NMAX, NINC, LOOPS = 128, 2048, 128, 1

    # Optional positional overrides, in this order: N NMAX NINC LOOPS.
    cli = sys.argv[1:]
    if len(cli) >= 1:
        N = int(cli[0])
    if len(cli) >= 2:
        NMAX = int(cli[1])
    if len(cli) >= 3:
        NINC = int(cli[2])
    if len(cli) >= 4:
        LOOPS = int(cli[3])

    # A non-empty OPENBLAS_LOOPS environment variable overrides the loop
    # count given on the command line.
    env_loops = os.environ.get('OPENBLAS_LOOPS')
    if env_loops:
        LOOPS = int(env_loops)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    # The stop value NMAX + NINC includes NMAX when it lies on the step grid;
    # otherwise the last size may exceed NMAX.
    for size in range(N, NMAX + NINC, NINC):
        run_dsyrk(size, LOOPS)
|
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import numpy
|
||||
from numpy import zeros
|
||||
from numpy.random import randn
|
||||
from scipy.linalg import blas
|
||||
|
||||
|
||||
def run_ssyrk(N, l):
    """Time ``l`` consecutive ``blas.ssyrk`` calls on an N x N float32 matrix.

    Parameters
    ----------
    N : int
        Matrix order; both the random input ``A`` and the zero-initialised
        output ``C`` are N x N and Fortran-ordered.
    l : int
        Number of ``ssyrk`` calls that are timed together.

    Prints one result row (size, MFlops, elapsed seconds) and returns None.
    The rate is computed from N**3 operations per call.
    """
    A = randn(N, N).astype('float32', order='F')
    C = zeros((N, N), dtype='float32', order='F')

    start = time.time()
    for _ in range(0, l):
        blas.ssyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()

    timediff = (end - start)
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        # The clock did not advance (coarse timer, tiny problem): the rate is
        # unmeasurable, so report infinity instead of dividing by zero.
        mflops = float('inf')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Defaults: sweep matrix orders 128..2048 in steps of 128, one call each.
    N, NMAX, NINC, LOOPS = 128, 2048, 128, 1

    # Up to four positional arguments override the defaults, in this order:
    # start size, end size, step, loop count. Anything further is ignored.
    positional = sys.argv[1:5]
    if len(positional) >= 1:
        N = int(positional[0])
    if len(positional) >= 2:
        NMAX = int(positional[1])
    if len(positional) >= 3:
        NINC = int(positional[2])
    if len(positional) >= 4:
        LOOPS = int(positional[3])

    # A non-empty OPENBLAS_LOOPS environment variable takes precedence over
    # the loop count given on the command line.
    env_loops = os.environ.get('OPENBLAS_LOOPS')
    if env_loops:
        LOOPS = int(env_loops)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for order in range(N, NMAX + NINC, NINC):
        run_ssyrk(order, LOOPS)
|
|
@ -0,0 +1,196 @@
|
|||
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
|
||||
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#ifndef _WIN32
#include <pthread.h>
#endif
#include <cblas.h>
#include <omp.h>
|
||||
#define MIN_SIZE 5
|
||||
#define MAX_SIZE 60
|
||||
#define NB_SIZE 10
|
||||
|
||||
// Number of loop iterations for a 1x1 matrix. Lower it if the test is
|
||||
// too slow on your computer.
|
||||
#define NLOOP 2e7
|
||||
|
||||
// Everything one benchmark run needs: the problem size, the iteration budget
// and the three callbacks that main() copies out of a row of the TESTS table.
typedef struct {
|
||||
// Order of the square test matrix; also used as the vector length.
int matrix_size;
|
||||
// Iteration budget. The bench functions divide it by matrix_size, and the
// threaded drivers divide it by the thread count first.
int n_loop;
|
||||
// Benchmark driver; it is invoked with a pointer to this structure.
void (* bench_func)();
|
||||
// BLAS routine under test. Left unprototyped because each bench function
// calls it with a different argument list.
void (* blas_func)();
|
||||
// Allocates and fills a buffer of `size` elements; the caller frees it.
void * (* create_matrix)(int size);
|
||||
} BenchParam;
|
||||
|
||||
/*
 * Allocate a single-precision buffer of `size` elements and fill it with the
 * ramp r[i] = 1e3 * i / size. The caller owns the buffer and must free() it.
 * Terminates the program if the allocation fails.
 */
void * s_create_matrix(int size) {
  /* Size the buffer from the element type actually stored (float). */
  float * r = malloc(size * sizeof(*r));
  int i;
  if (r == NULL && size > 0) {
    fprintf(stderr, "s_create_matrix: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate a single-precision complex buffer of `size` elements, stored as
 * 2 * size floats, and fill it with the ramp r[i] = 1e3 * i / size over all
 * 2 * size floats. The caller owns the buffer and must free() it.
 * Terminates the program if the allocation fails.
 */
void * c_create_matrix(int size) {
  /* Two floats per complex element; size from the stored element type. */
  float * r = malloc(size * 2 * sizeof(*r));
  int i;
  if (r == NULL && size > 0) {
    fprintf(stderr, "c_create_matrix: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < 2 * size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate a double-precision complex buffer of `size` elements, stored as
 * 2 * size doubles, and fill it with the ramp r[i] = 1e3 * i / size over all
 * 2 * size doubles. The caller owns the buffer and must free() it.
 * Terminates the program if the allocation fails.
 */
void * z_create_matrix(int size) {
  double * r = malloc(size * 2 * sizeof(*r));
  int i;
  if (r == NULL && size > 0) {
    fprintf(stderr, "z_create_matrix: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < 2 * size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate a double-precision buffer of `size` elements and fill it with the
 * ramp r[i] = 1e3 * i / size. The caller owns the buffer and must free() it.
 * Terminates the program if the allocation fails.
 */
void * d_create_matrix(int size) {
  double * r = malloc(size * sizeof(*r));
  int i;
  if (r == NULL && size > 0) {
    fprintf(stderr, "d_create_matrix: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Triangular matrix-vector benchmark: calls param->blas_func with the
 * ("U", "N", "N") options on a size x size matrix and a unit-stride vector,
 * n_loop / size times.
 */
void trmv_bench(BenchParam * param)
{
  int size = param->matrix_size;
  int remaining = param->n_loop / size;
  int one = 1;
  void * A = param->create_matrix(size * size);
  void * y = param->create_matrix(size);

  while (remaining-- > 0)
    param->blas_func("U", "N", "N", &size, A, &size, y, &one);

  free(A);
  free(y);
}
|
||||
|
||||
/*
 * Matrix-vector product benchmark: calls param->blas_func ("N", no transpose)
 * on a size x size matrix n_loop / size times. The same vector is passed as
 * both the input and the output operand, and the same scalar as alpha and
 * beta.
 */
void gemv_bench(BenchParam * param)
{
  int i, n;
  int size = param->matrix_size;
  n = param->n_loop / size;
  /* Scalar 1.01 stored as {re, im}. The table wires this function to both
     dgemv_ and zgemv_; a complex routine reads two doubles through the
     alpha/beta pointers, so a lone double would be read out of bounds. A
     real double-precision routine only looks at v[0]. */
  double v[2] = {1.01, 0.0};
  int one = 1;
  void * A = param->create_matrix(size * size);
  void * y = param->create_matrix(size);
  for(i = 0; i < n; i++) {
    param->blas_func("N", &size, &size, v, A, &size, y, &one, v, y, &one);
  }
  free(A);
  free(y);
}
|
||||
|
||||
/*
 * Rank-1 update benchmark: calls param->blas_func on a size x size matrix
 * n_loop / size times, passing the same unit-stride vector as both vector
 * operands.
 */
void ger_bench(BenchParam * param) {
  int i, n;
  int size = param->matrix_size;
  n = param->n_loop / size;
  /* Scalar 1.01 stored as {re, im}. The table wires this function to both
     dger_ and zgerc_; the complex routine reads two doubles through the
     alpha pointer, so a lone double would be read out of bounds. The real
     routine only looks at v[0]. */
  double v[2] = {1.01, 0.0};
  int one = 1;
  void * A = param->create_matrix(size * size);
  void * y = param->create_matrix(size);
  for(i = 0; i < n; i++) {
    param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
  }
  free(A);
  free(y);
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void * pthread_func_wrapper(void * param) {
|
||||
((BenchParam *)param)->bench_func(param);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Number of rows in the TESTS table below.
#define NB_TESTS 5
|
||||
// Benchmark table, four consecutive entries per row:
//   [0] bench driver, [1] BLAS routine, [2] matrix factory, [3] display name.
// main() reads row r through TESTS[r * 4 + k].
void * TESTS[4 * NB_TESTS] = {
|
||||
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||
ger_bench, dger_, d_create_matrix, "dger",
|
||||
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||
};
|
||||
|
||||
inline static double delta_time(struct timespec tick) {
|
||||
struct timespec tock;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||
}
|
||||
|
||||
/*
 * Run the benchmark on `nb_threads` pthreads, each given n_loop / nb_threads
 * of the iteration budget, and return the wall-clock seconds from just
 * before the first thread is created until the last one has been joined.
 * Always returns 0 when built with _WIN32 defined.
 */
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
  return 0;
#else
  pthread_t workers[nb_threads];
  BenchParam per_thread = *param;   /* one copy, handed to every thread */
  struct timespec started;
  int idx;

  per_thread.n_loop /= nb_threads;
  clock_gettime(CLOCK_MONOTONIC, &started);

  for (idx = 0; idx < nb_threads; ++idx) {
    int status = pthread_create(&workers[idx], NULL, pthread_func_wrapper, &per_thread);
    if (status != 0) {
      printf("ERROR; return code from pthread_create() is %d\n", status);
      exit(-1);
    }
  }

  idx = 0;
  while (idx < nb_threads) {
    pthread_join(workers[idx], NULL);
    ++idx;
  }

  return delta_time(started);
#endif
}
|
||||
|
||||
/*
 * Run the benchmark once on the calling thread with the full iteration
 * budget and return the elapsed wall-clock seconds.
 */
double seq_bench(BenchParam * param) {
  struct timespec started;

  clock_gettime(CLOCK_MONOTONIC, &started);
  (param->bench_func)(param);

  return delta_time(started);
}
|
||||
|
||||
/*
 * Run the benchmark once per iteration of an OpenMP parallel loop of
 * omp_get_max_threads() iterations, each given n_loop / thread-count of the
 * iteration budget, and return the elapsed wall-clock seconds.
 */
double omp_bench(BenchParam * param) {
  BenchParam per_thread = *param;   /* one copy, read by every iteration */
  struct timespec started;
  int worker;
  int team_size = omp_get_max_threads();

  per_thread.n_loop /= team_size;
  clock_gettime(CLOCK_MONOTONIC, &started);

  #pragma omp parallel for
  for (worker = 0; worker < team_size; worker++) {
    param->bench_func(&per_thread);
  }

  return delta_time(started);
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||
BenchParam param;
|
||||
int test_id;
|
||||
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||
double size = MIN_SIZE;
|
||||
param.bench_func = TESTS[test_id * 4];
|
||||
param.blas_func = TESTS[test_id * 4 + 1];
|
||||
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||
param.n_loop = NLOOP;
|
||||
while(size <= MAX_SIZE) {
|
||||
param.matrix_size = (int)(size + 0.5);
|
||||
double seq_time = seq_bench(¶m);
|
||||
double omp_time = omp_bench(¶m);
|
||||
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||
"pthread %gs, speedup %g\n",
|
||||
param.matrix_size, seq_time,
|
||||
omp_time, seq_time / omp_time,
|
||||
pthread_time, seq_time / pthread_time);
|
||||
size *= inc_factor;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
1
c_check
1
c_check
|
@ -6,6 +6,7 @@ $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
|||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
|
|
|
@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
|
|||
if (NOT NO_EXPRECISION)
|
||||
if (${F_COMPILER} MATCHES "GFORTRAN")
|
||||
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
|
||||
set(EXPRECISION 1)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
|
||||
endif ()
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
|
||||
set(EXPRECISION 1)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
|
||||
|
@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
|
||||
endif ()
|
||||
|
||||
if (USE_OPENMP)
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
|
||||
message(WARNING "Clang doesn't support OpenMP yet.")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
endif ()
|
||||
endif ()
|
||||
|
@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64")
|
|||
set(BINARY_DEFINED 1)
|
||||
|
||||
if (${F_COMPILER} MATCHES "GFORTRAN")
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
# EXPRECISION = 1
|
||||
# CCOMMON_OPT += -DEXPRECISION
|
||||
endif ()
|
||||
|
|
|
@ -48,18 +48,18 @@ set(SLASRC
|
|||
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
|
||||
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
|
||||
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
|
||||
sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
|
||||
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
|
||||
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
|
||||
sgetc2.f sgetri.f
|
||||
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
|
||||
sggglm.f sgghrd.f sgglse.f sggqrf.f
|
||||
sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
|
||||
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
|
||||
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
|
||||
slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
|
||||
DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
|
||||
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
|
||||
slansy.f slantb.f slantp.f slantr.f slanv2.f
|
||||
slapll.f slapmt.f
|
||||
|
@ -69,7 +69,7 @@ set(SLASRC
|
|||
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f
|
||||
slarrv.f slartv.f
|
||||
slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f
|
||||
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f
|
||||
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
|
||||
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
|
||||
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
|
||||
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
|
||||
|
@ -97,7 +97,7 @@ set(SLASRC
|
|||
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
|
||||
stptrs.f
|
||||
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
|
||||
strtrs.f stzrqf.f stzrzf.f sstemr.f
|
||||
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
|
||||
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
|
||||
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
|
||||
sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f
|
||||
|
@ -114,14 +114,14 @@ set(CLASRC
|
|||
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
|
||||
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
|
||||
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
|
||||
cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
|
||||
cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
|
||||
cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
|
||||
DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
|
||||
cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
|
||||
DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
|
||||
cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f
|
||||
cgesvx.f cgetc2.f cgetri.f
|
||||
cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f
|
||||
cgghrd.f cgglse.f cggqrf.f cggrqf.f
|
||||
cggsvd.f cggsvp.f
|
||||
DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f
|
||||
cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
|
||||
chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f
|
||||
checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f
|
||||
|
@ -138,7 +138,7 @@ set(CLASRC
|
|||
claed0.f claed7.f claed8.f
|
||||
claein.f claesy.f claev2.f clags2.f clagtm.f
|
||||
clahef.f clahef_rook.f clahqr.f
|
||||
clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
|
||||
DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
|
||||
clanhb.f clanhe.f
|
||||
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
|
||||
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
|
||||
|
@ -149,7 +149,7 @@ set(CLASRC
|
|||
clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
|
||||
clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
|
||||
DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
|
||||
cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f
|
||||
cposv.f cposvx.f cpstrf.f cpstf2.f
|
||||
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
||||
|
@ -166,7 +166,7 @@ set(CLASRC
|
|||
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
|
||||
ctprfs.f ctptri.f
|
||||
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
|
||||
ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f
|
||||
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
|
||||
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
|
||||
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
|
||||
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
|
||||
|
@ -186,18 +186,18 @@ set(DLASRC
|
|||
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
|
||||
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
|
||||
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
|
||||
dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
|
||||
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
|
||||
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
|
||||
dgetc2.f dgetri.f
|
||||
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
|
||||
dggglm.f dgghrd.f dgglse.f dggqrf.f
|
||||
dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
|
||||
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
|
||||
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
|
||||
dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
|
||||
DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
|
||||
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
|
||||
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
|
||||
dlapll.f dlapmt.f
|
||||
|
@ -207,7 +207,7 @@ set(DLASRC
|
|||
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f
|
||||
dlargv.f dlarrv.f dlartv.f
|
||||
dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f
|
||||
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f
|
||||
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
|
||||
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
|
||||
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
|
||||
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
|
||||
|
@ -235,7 +235,7 @@ set(DLASRC
|
|||
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
|
||||
dtptrs.f
|
||||
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
|
||||
dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f
|
||||
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
|
||||
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
|
||||
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
|
||||
dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f
|
||||
|
@ -251,14 +251,14 @@ set(ZLASRC
|
|||
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
|
||||
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
|
||||
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
|
||||
zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
|
||||
zgetri.f
|
||||
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
|
||||
zgghrd.f zgglse.f zggqrf.f zggrqf.f
|
||||
zggsvd.f zggsvp.f
|
||||
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
|
||||
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
|
||||
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
|
||||
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
|
||||
|
@ -275,7 +275,7 @@ set(ZLASRC
|
|||
zlaed0.f zlaed7.f zlaed8.f
|
||||
zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f
|
||||
zlahef.f zlahef_rook.f zlahqr.f
|
||||
zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
|
||||
DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
|
||||
zlangt.f zlanhb.f
|
||||
zlanhe.f
|
||||
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
|
||||
|
@ -288,7 +288,7 @@ set(ZLASRC
|
|||
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
|
||||
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
|
||||
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
|
||||
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
|
||||
|
@ -306,7 +306,7 @@ set(ZLASRC
|
|||
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
|
||||
ztprfs.f ztptri.f
|
||||
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
|
||||
ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f
|
||||
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
|
||||
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
|
||||
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
|
||||
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
|
||||
|
|
|
@ -2038,6 +2038,59 @@ set(MATGEN
|
|||
lapacke_zlagsy_work.c
|
||||
)
|
||||
|
||||
set(Utils_SRC
|
||||
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
|
||||
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
|
||||
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
|
||||
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
|
||||
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
|
||||
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
|
||||
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
|
||||
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
|
||||
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
|
||||
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
|
||||
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
|
||||
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
|
||||
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
|
||||
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
|
||||
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
|
||||
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
|
||||
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
|
||||
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
|
||||
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
|
||||
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
|
||||
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
|
||||
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
|
||||
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
|
||||
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
|
||||
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
|
||||
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
|
||||
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
|
||||
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
|
||||
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
|
||||
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
|
||||
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
|
||||
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
|
||||
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
|
||||
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
|
||||
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
|
||||
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
|
||||
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
|
||||
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
|
||||
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
|
||||
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
|
||||
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
|
||||
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
|
||||
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
|
||||
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
|
||||
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
|
||||
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
|
||||
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
|
||||
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
|
||||
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
|
||||
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
|
||||
)
|
||||
|
||||
set(LAPACKE_REL_SRC "")
|
||||
if (BUILD_SINGLE)
|
||||
list(APPEND LAPACKE_REL_SRC ${SSRC})
|
||||
|
@ -2058,10 +2111,14 @@ endif ()
|
|||
# add lapack-netlib folder to the sources
|
||||
set(LAPACKE_SOURCES "")
|
||||
foreach (LAE_FILE ${LAPACKE_REL_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}")
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include")
|
||||
foreach (Utils_FILE ${Utils_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}")
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
|
9
common.h
9
common.h
|
@ -86,13 +86,14 @@ extern "C" {
|
|||
#if !defined(_MSC_VER)
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <time.h>
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#include <malloc.h>
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
|
@ -331,12 +332,13 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
|
@ -410,7 +412,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p))
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
#else
|
||||
typedef char* env_var_t;
|
||||
#define readenv(p, n) ((p)=getenv(n))
|
||||
|
@ -726,6 +728,7 @@ typedef struct {
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#include "common_stackalloc.h"
|
||||
#if 0
|
||||
#include "symcopy.h"
|
||||
#endif
|
||||
|
|
|
@ -43,28 +43,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
long register ret;
|
||||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"ldaxr %0, [%1] \n\t"
|
||||
"stlxr w2, %2, [%1] \n\t"
|
||||
"orr %0, %0, x2 \n\t"
|
||||
: "=r"(ret)
|
||||
: "r"(address), "r"(1l)
|
||||
: "memory", "x2"
|
||||
"mov x4, #1 \n\t"
|
||||
"1: \n\t"
|
||||
"ldaxr x2, [%1] \n\t"
|
||||
"cbnz x2, 1b \n\t"
|
||||
"2: \n\t"
|
||||
"stxr w3, x4, [%1] \n\t"
|
||||
"cbnz w3, 1b \n\t"
|
||||
"mov %0, #0 \n\t"
|
||||
: "=r"(ret), "=r"(address)
|
||||
: "1"(address)
|
||||
: "memory", "x2" , "x3", "x4"
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
} while (ret);
|
||||
MB;
|
||||
|
||||
}
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
@ -89,8 +100,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.func REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
REALNAME:
|
||||
|
||||
#define EPILOGUE
|
||||
|
@ -107,7 +120,11 @@ REALNAME:
|
|||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#if defined(CORTEXA57)
|
||||
#define BUFFER_SIZE (20 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#endif
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
|
|
@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#define L1_DUALFETCH
|
||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#
|
||||
#ifndef L1_PREFETCH
|
||||
#define L1_PREFETCH dcbt
|
||||
#endif
|
||||
|
@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
|
|||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
// Stack-canary support used by STACK_ALLOC / STACK_FREE. The switch is
// defined unconditionally here, so the empty #else variants below only take
// effect if this line is removed.
#define STACK_ALLOC_PROTECT
|
||||
#ifdef STACK_ALLOC_PROTECT
|
||||
// Try to detect stack smashing
|
||||
#include <assert.h>
|
||||
// Declares a local canary variable holding a fixed magic value.
#define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234;
|
||||
// Asserts that the canary still holds the magic value. Being an assert(),
// the check compiles to nothing when NDEBUG is defined.
#define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234);
|
||||
#else
|
||||
#define STACK_ALLOC_PROTECT_SET
|
||||
#define STACK_ALLOC_PROTECT_CHECK
|
||||
#endif
|
||||
|
||||
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
|
||||
/*
|
||||
* Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC.
|
||||
* Stack allocation is much faster than blas_memory_alloc or malloc, particularly
|
||||
* when OpenBLAS is used from a multi-threaded application.
|
||||
* SIZE must be carefully chosen to be:
|
||||
* - as small as possible to maximize the number of stack allocations
|
||||
* - large enough to support all architectures and kernels
|
||||
* Choosing too small a SIZE will lead to stack smashing.
|
||||
*/
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER)                                        \
  /* make it volatile because some functions (ex: dgemv_n.S) */                \
  /* do not restore all registers */                                           \
  volatile int stack_alloc_size = SIZE;                                        \
  /* requests larger than MAX_STACK_ALLOC bytes use blas_memory_alloc: */      \
  /* a size of 0 selects the heap path below and in STACK_FREE */              \
  if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE))                       \
    stack_alloc_size = 0;                                                      \
  STACK_ALLOC_PROTECT_SET                                                      \
  /* a variable-length array must have a length greater than zero, */          \
  /* so reserve a single element when the heap path is taken */                \
  TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1]                   \
      __attribute__((aligned(0x20)));                                          \
  BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
#define STACK_FREE(BUFFER) \
|
||||
STACK_ALLOC_PROTECT_CHECK \
|
||||
if(!stack_alloc_size) \
|
||||
blas_memory_free(BUFFER);
|
||||
#else
|
||||
#define STACK_FREE(BUFFER) blas_memory_free(BUFFER)
|
||||
#endif
|
||||
|
13
common_x86.h
13
common_x86.h
|
@ -41,6 +41,10 @@
|
|||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
#ifdef C_MSVC
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
|
||||
|
@ -170,12 +174,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
if (y <= 1) return x;
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
result = x/y;
|
||||
return result;
|
||||
#else
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
(void*)result;
|
||||
return x*y;
|
||||
#else
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
|
||||
return result;
|
||||
|
|
|
@ -396,7 +396,7 @@ REALNAME:
|
|||
|
||||
#define PROFCODE
|
||||
|
||||
#define EPILOGUE .end REALNAME
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
|
|
24
cpuid_arm.c
24
cpuid_arm.c
|
@ -115,6 +115,9 @@ int detect(void)
|
|||
if (strstr(p, "0xc0f")) {
|
||||
return CPU_CORTEXA15;
|
||||
}
|
||||
if (strstr(p, "0xd07")) {
|
||||
return CPU_ARMV7; //ARMV8 on 32-bit
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -158,6 +161,27 @@ int detect(void)
|
|||
|
||||
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("CPU architecture", buffer, 16)))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "8")) {
|
||||
return CPU_ARMV7; //ARMV8 on 32-bit
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return CPU_UNKNOWN;
|
||||
|
|
|
@ -29,12 +29,19 @@
|
|||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
#define CPU_CORTEXA57 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"ARMV8"
|
||||
"UNKNOWN",
|
||||
"ARMV8" ,
|
||||
"CORTEXA57"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"unknown",
|
||||
"armv8" ,
|
||||
"cortexa57"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
{
|
||||
|
@ -53,13 +60,13 @@ int get_feature(char *search)
|
|||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
fclose(infile);
|
||||
|
||||
|
||||
if( p == NULL ) return;
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
|
@ -82,11 +89,30 @@ int detect(void)
|
|||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
|
||||
if (!strncmp("CPU part", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "0xd07")) {
|
||||
return CPU_CORTEXA57;
|
||||
}
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
|
||||
(!strncmp("CPU architecture", buffer, 16)))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
|
@ -100,7 +126,7 @@ int detect(void)
|
|||
|
||||
if (strstr(p, "AArch64"))
|
||||
{
|
||||
return CPU_ARMV8;
|
||||
return CPU_ARMV8;
|
||||
|
||||
}
|
||||
|
||||
|
@ -118,23 +144,13 @@ char *get_corename(void)
|
|||
|
||||
void get_architecture(void)
|
||||
{
|
||||
printf("ARM");
|
||||
printf("ARM64");
|
||||
}
|
||||
|
||||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV8:
|
||||
printf("ARMV8");
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("UNKNOWN");
|
||||
break;
|
||||
}
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
void get_subdirname(void)
|
||||
|
@ -160,26 +176,34 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
printf("#define CORTEXA57\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
printf("#define HAVE_NEON\n");
|
||||
printf("#define HAVE_VFPV4\n");
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV8:
|
||||
printf("armv8\n");
|
||||
break;
|
||||
|
||||
}
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
|
||||
void get_features(void)
|
||||
{
|
||||
|
||||
|
|
|
@ -55,6 +55,7 @@
|
|||
#define CPUTYPE_POWER6 5
|
||||
#define CPUTYPE_CELL 6
|
||||
#define CPUTYPE_PPCG4 7
|
||||
#define CPUTYPE_POWER8 8
|
||||
|
||||
char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
@ -65,6 +66,7 @@ char *cpuname[] = {
|
|||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
};
|
||||
|
||||
char *lowercpuname[] = {
|
||||
|
@ -76,6 +78,7 @@ char *lowercpuname[] = {
|
|||
"power6",
|
||||
"cell",
|
||||
"ppcg4",
|
||||
"power8"
|
||||
};
|
||||
|
||||
char *corename[] = {
|
||||
|
@ -87,6 +90,7 @@ char *corename[] = {
|
|||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -115,7 +119,7 @@ int detect(void){
|
|||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
|
|
24
cpuid_x86.c
24
cpuid_x86.c
|
@ -1172,6 +1172,9 @@ int get_cpuname(void){
|
|||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13:
|
||||
// Avoton
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
|
@ -1229,6 +1232,7 @@ int get_cpuname(void){
|
|||
case 2:
|
||||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 6:
|
||||
|
@ -1239,13 +1243,19 @@ int get_cpuname(void){
|
|||
return CPUTYPE_BULLDOZER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 2:
|
||||
case 2: //AMD Piledriver
|
||||
case 3: //AMD Richland
|
||||
if(support_avx())
|
||||
return CPUTYPE_PILEDRIVER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
return CPUTYPE_PILEDRIVER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 3:
|
||||
if(support_avx())
|
||||
return CPUTYPE_STEAMROLLER;
|
||||
|
@ -1668,6 +1678,9 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 13:
|
||||
// Avoton
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
|
@ -1718,7 +1731,8 @@ int get_coretype(void){
|
|||
return CORE_BULLDOZER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
case 2:
|
||||
case 2: //AMD Piledriver
|
||||
case 3: //AMD Richland
|
||||
if(support_avx())
|
||||
return CORE_PILEDRIVER;
|
||||
else
|
||||
|
@ -1726,6 +1740,12 @@ int get_coretype(void){
|
|||
|
||||
case 0:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
return CORE_PILEDRIVER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
|
||||
case 3:
|
||||
if(support_avx())
|
||||
return CORE_STEAMROLLER;
|
||||
|
|
|
@ -1365,8 +1365,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1365,8 +1365,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1335,8 +1335,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1339,8 +1339,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1350,7 +1350,7 @@
|
|||
*
|
||||
* Call the subroutine.
|
||||
*
|
||||
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
|
||||
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
|
||||
IF( FULL )THEN
|
||||
IF( TRACE )
|
||||
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
|
||||
|
@ -1376,7 +1376,7 @@
|
|||
CALL CZTPMV( IORDER, UPLO, TRANS, DIAG,
|
||||
$ N, AA, XX, INCX )
|
||||
END IF
|
||||
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
|
||||
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
|
||||
IF( FULL )THEN
|
||||
IF( TRACE )
|
||||
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
|
||||
|
@ -1465,7 +1465,7 @@
|
|||
END IF
|
||||
*
|
||||
IF( .NOT.NULL )THEN
|
||||
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
|
||||
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
|
||||
*
|
||||
* Check the result.
|
||||
*
|
||||
|
@ -1473,7 +1473,7 @@
|
|||
$ INCX, ZERO, Z, INCX, XT, G,
|
||||
$ XX, EPS, ERR, FATAL, NOUT,
|
||||
$ .TRUE. )
|
||||
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
|
||||
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
|
||||
*
|
||||
* Compute approximation to original vector.
|
||||
*
|
||||
|
@ -1611,7 +1611,7 @@
|
|||
* .. Common blocks ..
|
||||
COMMON /INFOC/INFOT, NOUTC, OK
|
||||
* .. Executable Statements ..
|
||||
CONJ = SNAME( 5: 5 ).EQ.'c'
|
||||
CONJ = SNAME( 11: 11 ).EQ.'c'
|
||||
* Define the number of arguments.
|
||||
NARGS = 9
|
||||
*
|
||||
|
|
|
@ -1366,8 +1366,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1366,8 +1366,9 @@
|
|||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG incx, incy;
|
||||
BLASLONG incx;
|
||||
BLASLONG m_from, m_to, i;
|
||||
#ifndef COMPLEX
|
||||
FLOAT result;
|
||||
|
@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
y = (FLOAT *)args -> c;
|
||||
|
||||
incx = args -> ldb;
|
||||
incy = args -> ldc;
|
||||
|
||||
m_from = 0;
|
||||
m_to = args -> m;
|
||||
|
|
|
@ -43,7 +43,7 @@
|
|||
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG lda, incx, incy;
|
||||
BLASLONG incx, incy;
|
||||
BLASLONG i, m_from, m_to;
|
||||
FLOAT alpha_r;
|
||||
#ifdef COMPLEX
|
||||
|
@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
|
||||
incx = args -> lda;
|
||||
incy = args -> ldb;
|
||||
lda = args -> ldc;
|
||||
|
||||
alpha_r = *((FLOAT *)args -> alpha + 0);
|
||||
#ifdef COMPLEX
|
||||
|
|
|
@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
BLASLONG incx;
|
||||
BLASLONG i, m_from, m_to;
|
||||
FLOAT alpha_r;
|
||||
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
|
||||
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
|
||||
FLOAT alpha_i;
|
||||
#endif
|
||||
|
||||
|
@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
incx = args -> lda;
|
||||
|
||||
alpha_r = *((FLOAT *)args -> alpha + 0);
|
||||
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
|
||||
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
|
||||
alpha_i = *((FLOAT *)args -> alpha + 1);
|
||||
#endif
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG lda, incx, incy;
|
||||
BLASLONG lda, incx;
|
||||
BLASLONG m_from, m_to;
|
||||
|
||||
a = (FLOAT *)args -> a;
|
||||
|
@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
|
||||
lda = args -> lda;
|
||||
incx = args -> ldb;
|
||||
incy = args -> ldc;
|
||||
|
||||
m_from = 0;
|
||||
m_to = args -> m;
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,12 +43,10 @@
|
|||
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,12 +43,10 @@
|
|||
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
#endif
|
||||
|
||||
x = buffer;
|
||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
||||
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||
}
|
||||
|
||||
#ifndef TRANS
|
||||
|
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT atemp1, atemp2, btemp1, btemp2;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT atemp1, atemp2, btemp1, btemp2;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT ar, ai, br, bi, ratio, den;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT ar, ai, br, bi, ratio, den;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -48,8 +48,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
# TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
|
||||
|
||||
# Need to set CONJ for trmm and trsm
|
||||
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type})
|
||||
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type})
|
||||
|
@ -72,6 +71,10 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type})
|
||||
|
||||
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
#herk
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
|
||||
|
||||
#hemm
|
||||
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type})
|
||||
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type})
|
||||
|
@ -96,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
# for gemm3m
|
||||
if(USE_GEMM3M)
|
||||
foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
||||
string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
endif ()
|
||||
endforeach ()
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
|
||||
BLASLONG procs, num_cpu_m, num_cpu_n;
|
||||
|
||||
BLASLONG width, i, j;
|
||||
BLASLONG divM, divN;
|
||||
|
|
|
@ -335,7 +335,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
BLASLONG is, min_i, div_n;
|
||||
|
||||
BLASLONG i, current;
|
||||
BLASLONG l1stride, l2size;
|
||||
BLASLONG l1stride;
|
||||
|
||||
#ifdef TIMING
|
||||
BLASULONG rpcc_counter;
|
||||
|
@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#endif
|
||||
) return 0;
|
||||
|
||||
l2size = GEMM_P * GEMM_Q;
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
|
||||
mypos, m_from, m_to, n_from, n_to, N_from, N_to);
|
||||
|
@ -369,7 +367,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
|
||||
START_RPCC();
|
||||
|
@ -706,7 +706,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
n = n_to - n_from;
|
||||
}
|
||||
|
||||
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
|
||||
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
|
||||
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -33,6 +33,7 @@ set(COMMON_SOURCES
|
|||
xerbla.c
|
||||
openblas_set_num_threads.c
|
||||
openblas_error_handle.c
|
||||
openblas_env.c
|
||||
openblas_get_num_procs.c
|
||||
openblas_get_num_threads.c
|
||||
)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
|
||||
|
||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
|
@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
|
|||
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_env.$(SUFFIX) : openblas_env.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#endif
|
||||
|
||||
extern unsigned int openblas_thread_timeout();
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
|
||||
#undef MONITOR
|
||||
|
@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
|
|||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
int thread_timeout_env;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
|
@ -540,22 +543,12 @@ int blas_thread_init(void){
|
|||
|
||||
if (!blas_server_avail){
|
||||
|
||||
env_var_t p;
|
||||
|
||||
if (readenv(p,"THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}else{
|
||||
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}
|
||||
}
|
||||
|
||||
thread_timeout_env=openblas_thread_timeout();
|
||||
if (thread_timeout_env>0) {
|
||||
if (thread_timeout_env < 4) thread_timeout_env = 4;
|
||||
if (thread_timeout_env > 30) thread_timeout_env = 30;
|
||||
thread_timeout = (1 << thread_timeout_env);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
|
||||
|
@ -576,10 +569,12 @@ int blas_thread_init(void){
|
|||
struct rlimit rlim;
|
||||
const char *msg = strerror(ret);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
|
||||
#ifdef RLIMIT_NPROC
|
||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||
"%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
|
||||
}
|
||||
#endif
|
||||
if(0 != raise(SIGINT)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
|
||||
exit(EXIT_FAILURE);
|
||||
|
|
|
@ -261,6 +261,11 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Avoton
|
||||
if (model == 13) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
|
@ -318,7 +323,7 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 2){
|
||||
}else if(model == 2 || model == 3){
|
||||
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
|
@ -327,7 +332,15 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0){
|
||||
if (exmodel == 3) {
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 3) {
|
||||
//AMD STEAMROLLER
|
||||
if(support_avx())
|
||||
return &gotoblas_STEAMROLLER;
|
||||
|
@ -378,7 +391,7 @@ static char *corename[] = {
|
|||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron(SSE3)",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
|
|
|
@ -104,6 +104,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
|
@ -142,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define CONSTRUCTOR __cdecl
|
||||
#define DESTRUCTOR __cdecl
|
||||
#elif defined(OS_DARWIN) && defined(C_GCC)
|
||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
|
@ -167,7 +169,7 @@ void goto_set_num_threads(int num_threads) {};
|
|||
|
||||
#else
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
|
@ -292,8 +294,11 @@ void openblas_fork_handler()
|
|||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
env_var_t p;
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
|
@ -308,18 +313,18 @@ int blas_get_cpu_number(void){
|
|||
|
||||
blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
|
@ -355,7 +360,9 @@ int openblas_get_num_threads(void) {
|
|||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
return blas_get_cpu_number();
|
||||
// init blas_cpu_number if needed
|
||||
blas_get_cpu_number();
|
||||
return blas_cpu_number;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -914,7 +921,6 @@ static volatile struct {
|
|||
} memory[NUM_BUFFERS];
|
||||
|
||||
static int memory_initialized = 0;
|
||||
static void gotoblas_memory_init(void);
|
||||
|
||||
/* Memory allocation routine */
|
||||
/* procpos ... indicates where it comes from */
|
||||
|
@ -1337,6 +1343,7 @@ static void gotoblas_memory_init(void) {
|
|||
/* Initialization for all functions; this function should be called before main */
|
||||
|
||||
static int gotoblas_initialized = 0;
|
||||
extern void openblas_read_env();
|
||||
|
||||
void CONSTRUCTOR gotoblas_init(void) {
|
||||
|
||||
|
@ -1346,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
|||
openblas_fork_handler();
|
||||
#endif
|
||||
|
||||
openblas_read_env();
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
|
@ -1362,6 +1371,19 @@ void CONSTRUCTOR gotoblas_init(void) {
|
|||
gotoblas_memory_init();
|
||||
#endif
|
||||
|
||||
//#if defined(OS_LINUX)
|
||||
#if 0
|
||||
struct rlimit curlimit;
|
||||
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
|
||||
{
|
||||
if ( curlimit.rlim_cur != curlimit.rlim_max )
|
||||
{
|
||||
curlimit.rlim_cur = curlimit.rlim_max;
|
||||
setrlimit(RLIMIT_STACK, &curlimit);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
if (blas_cpu_number == 0) blas_get_cpu_number();
|
||||
#ifdef SMP_SERVER
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Cached copies of the environment settings.  Every value starts at 0 and
 * is filled in by openblas_read_env(); the accessors below only return the
 * cached value and never touch the environment themselves.
 */
static int          openblas_env_verbose              = 0;
static unsigned int openblas_env_thread_timeout       = 0;
static int          openblas_env_block_factor         = 0;
static int          openblas_env_openblas_num_threads = 0;
static int          openblas_env_goto_num_threads     = 0;
static int          openblas_env_omp_num_threads      = 0;

/* Cached OPENBLAS_VERBOSE value. */
int openblas_verbose() {
  return openblas_env_verbose;
}

/* Cached OPENBLAS_THREAD_TIMEOUT value. */
unsigned int openblas_thread_timeout() {
  return openblas_env_thread_timeout;
}

/* Cached OPENBLAS_BLOCK_FACTOR value. */
int openblas_block_factor() {
  return openblas_env_block_factor;
}

/* Cached OPENBLAS_NUM_THREADS value. */
int openblas_num_threads_env() {
  return openblas_env_openblas_num_threads;
}

/* Cached GOTO_NUM_THREADS value. */
int openblas_goto_num_threads_env() {
  return openblas_env_goto_num_threads;
}

/* Cached OMP_NUM_THREADS value. */
int openblas_omp_num_threads_env() {
  return openblas_env_omp_num_threads;
}
|
||||
|
||||
void openblas_read_env() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_verbose=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_block_factor=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_thread_timeout=(unsigned int)ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_goto_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_omp_num_threads=ret;
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
int openblas_verbose() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
return ret;
|
||||
}
|
||||
extern int openblas_verbose();
|
||||
|
||||
void openblas_warning(int verbose, const char * msg) {
|
||||
int current_verbose;
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
#include <string.h>
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_block_factor();
|
||||
int get_L2_size(void);
|
||||
|
||||
#define DEFAULT_GEMM_P 128
|
||||
|
@ -249,7 +250,6 @@ int get_L2_size(void){
|
|||
|
||||
void blas_set_parameter(void){
|
||||
|
||||
env_var_t p;
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
int size = 16;
|
||||
|
@ -468,9 +468,8 @@ void blas_set_parameter(void){
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
|
||||
factor = atoi(p);
|
||||
factor=openblas_block_factor();
|
||||
if (factor>0) {
|
||||
if (factor < 10) factor = 10;
|
||||
if (factor > 200) factor = 200;
|
||||
|
||||
|
|
|
@ -26,10 +26,16 @@ ifndef ONLY_CBLAS
|
|||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_LAPACK_DEPRECATED
|
||||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
EXTRALIB += -lgfortran
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
EXTRALIB += -lgomp
|
||||
|
@ -39,9 +45,11 @@ endif
|
|||
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
EXTRALIB += -lgfortran
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
all::
|
||||
|
||||
|
@ -88,17 +96,17 @@ dll : ../$(LIBDLLNAME)
|
|||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
else
|
||||
../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def
|
||||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
|
||||
../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def
|
||||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed
|
||||
$(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
|
||||
#only build without Fortran
|
||||
|
@ -110,7 +118,7 @@ endif
|
|||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
@ -201,26 +209,26 @@ static : ../$(LIBNAME)
|
|||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF*
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
||||
include ../Makefile.tail
|
||||
|
||||
|
|
|
@ -173,18 +173,18 @@
|
|||
sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv,
|
||||
sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2,
|
||||
sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx,
|
||||
sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf,
|
||||
sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf,
|
||||
sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs,
|
||||
sgehd2, sgehrd, sgelq2, sgelqf,
|
||||
sgels, sgelsd, sgelss, sgelsy, sgeql2, sgeqlf,
|
||||
sgeqp3, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs,
|
||||
sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx,
|
||||
sgetc2, sgetri,
|
||||
sggbak, sggbal, sgges, sggesx, sggev, sggevx,
|
||||
sggglm, sgghrd, sgglse, sggqrf,
|
||||
sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv,
|
||||
sggrqf, sgtcon, sgtrfs, sgtsv,
|
||||
sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz,
|
||||
shsein, shseqr, slabrd, slacon, slacn2,
|
||||
slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr,
|
||||
slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd,
|
||||
slahr2, slaic1, slaln2, slals0, slalsa, slalsd,
|
||||
slangb, slange, slangt, slanhs, slansb, slansp,
|
||||
slansy, slantb, slantp, slantr, slanv2,
|
||||
slapll, slapmt,
|
||||
|
@ -194,7 +194,7 @@
|
|||
slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv,
|
||||
slarrv, slartv,
|
||||
slarz, slarzb, slarzt, slasy2, slasyf,
|
||||
slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm,
|
||||
slatbs, slatdf, slatps, slatrd, slatrs, slatrz,
|
||||
sopgtr, sopmtr, sorg2l, sorg2r,
|
||||
sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2,
|
||||
sorgrq, sorgtr, sorm2l, sorm2r,
|
||||
|
@ -220,7 +220,7 @@
|
|||
stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri,
|
||||
stptrs,
|
||||
strcon, strevc, strexc, strrfs, strsen, strsna, strsyl,
|
||||
strtrs, stzrqf, stzrzf, sstemr,
|
||||
strtrs, stzrzf, sstemr,
|
||||
slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp,
|
||||
stfttr, stpttf, stpttr, strttf, strttp,
|
||||
sgejsv, sgesvj, sgsvj0, sgsvj1,
|
||||
|
@ -245,14 +245,13 @@
|
|||
cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx,
|
||||
cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd,
|
||||
cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx,
|
||||
cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf,
|
||||
cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3,
|
||||
cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs,
|
||||
cgehd2, cgehrd, cgelq2, cgelqf,
|
||||
cgels, cgelsd, cgelss, cgelsy, cgeql2, cgeqlf, cgeqp3,
|
||||
cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs,
|
||||
cgerq2, cgerqf, cgesc2, cgesdd, cgesvd,
|
||||
cgesvx, cgetc2, cgetri,
|
||||
cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm,
|
||||
cgghrd, cgglse, cggqrf, cggrqf,
|
||||
cggsvd, cggsvp,
|
||||
cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev,
|
||||
chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd,
|
||||
checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst,
|
||||
|
@ -267,7 +266,7 @@
|
|||
claed0, claed7, claed8,
|
||||
claein, claesy, claev2, clags2, clagtm,
|
||||
clahef, clahqr,
|
||||
clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt,
|
||||
clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt,
|
||||
clanhb, clanhe,
|
||||
clanhp, clanhs, clanht, clansb, clansp, clansy, clantb,
|
||||
clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge,
|
||||
|
@ -278,7 +277,7 @@
|
|||
clarfx, clargv, clarnv, clarrv, clartg, clartv,
|
||||
clarz, clarzb, clarzt, clascl, claset, clasr, classq,
|
||||
clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz,
|
||||
clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv,
|
||||
cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv,
|
||||
cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs,
|
||||
cposv, cposvx, cpstrf, cpstf2,
|
||||
cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs,
|
||||
|
@ -293,7 +292,7 @@
|
|||
ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon,
|
||||
ctprfs, ctptri,
|
||||
ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna,
|
||||
ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r,
|
||||
ctrsyl, ctrtrs, ctzrzf, cung2l, cung2r,
|
||||
cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2,
|
||||
cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2,
|
||||
cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz,
|
||||
|
@ -321,18 +320,18 @@
|
|||
dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv,
|
||||
dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2,
|
||||
dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx,
|
||||
dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf,
|
||||
dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf,
|
||||
dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs,
|
||||
dgehd2, dgehrd, dgelq2, dgelqf,
|
||||
dgels, dgelsd, dgelss, dgelsy, dgeql2, dgeqlf,
|
||||
dgeqp3, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs,
|
||||
dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx,
|
||||
dgetc2, dgetri,
|
||||
dggbak, dggbal, dgges, dggesx, dggev, dggevx,
|
||||
dggglm, dgghrd, dgglse, dggqrf,
|
||||
dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv,
|
||||
dggrqf, dgtcon, dgtrfs, dgtsv,
|
||||
dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz,
|
||||
dhsein, dhseqr, dlabrd, dlacon, dlacn2,
|
||||
dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr,
|
||||
dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd,
|
||||
dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd,
|
||||
dlangb, dlange, dlangt, dlanhs, dlansb, dlansp,
|
||||
dlansy, dlantb, dlantp, dlantr, dlanv2,
|
||||
dlapll, dlapmt,
|
||||
|
@ -342,7 +341,7 @@
|
|||
dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx,
|
||||
dlargv, dlarrv, dlartv,
|
||||
dlarz, dlarzb, dlarzt, dlasy2, dlasyf,
|
||||
dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm,
|
||||
dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz,
|
||||
dopgtr, dopmtr, dorg2l, dorg2r,
|
||||
dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2,
|
||||
dorgrq, dorgtr, dorm2l, dorm2r,
|
||||
|
@ -368,7 +367,7 @@
|
|||
dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri,
|
||||
dtptrs,
|
||||
dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl,
|
||||
dtrtrs, dtzrqf, dtzrzf, dstemr,
|
||||
dtrtrs, dtzrzf, dstemr,
|
||||
dsgesv, dsposv, dlag2s, slag2d, dlat2s,
|
||||
dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp,
|
||||
dtfttr, dtpttf, dtpttr, dtrttf, dtrttp,
|
||||
|
@ -387,14 +386,13 @@
|
|||
zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx,
|
||||
zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd,
|
||||
zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx,
|
||||
zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf,
|
||||
zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3,
|
||||
zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf,
|
||||
zgehd2, zgehrd, zgelq2, zgelqf,
|
||||
zgels, zgelsd, zgelss, zgelsy, zgeql2, zgeqlf, zgeqp3,
|
||||
zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf,
|
||||
zgesc2, zgesdd, zgesvd, zgesvx, zgetc2,
|
||||
zgetri,
|
||||
zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm,
|
||||
zgghrd, zgglse, zggqrf, zggrqf,
|
||||
zggsvd, zggsvp,
|
||||
zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev,
|
||||
zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd,
|
||||
zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst,
|
||||
|
@ -409,7 +407,7 @@
|
|||
zlaed0, zlaed7, zlaed8,
|
||||
zlaein, zlaesy, zlaev2, zlags2, zlagtm,
|
||||
zlahef, zlahqr,
|
||||
zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange,
|
||||
zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange,
|
||||
zlangt, zlanhb,
|
||||
zlanhe,
|
||||
zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb,
|
||||
|
@ -422,7 +420,7 @@
|
|||
zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv,
|
||||
zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr,
|
||||
zlassq, zlasyf,
|
||||
zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm,
|
||||
zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz,
|
||||
zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv,
|
||||
zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs,
|
||||
zposv, zposvx, zpotrs, zpstrf, zpstf2,
|
||||
|
@ -438,7 +436,7 @@
|
|||
ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon,
|
||||
ztprfs, ztptri,
|
||||
ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna,
|
||||
ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l,
|
||||
ztrsyl, ztrtrs, ztzrzf, zung2l,
|
||||
zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2,
|
||||
zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2,
|
||||
zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz,
|
||||
|
@ -452,6 +450,139 @@
|
|||
zunbdb5, zunbdb6, zuncsd, zuncsd2by1,
|
||||
zgeqrt, zgeqrt2, zgeqrt3, zgemqrt,
|
||||
ztpqrt, ztpqrt2, ztpmqrt, ztprfb,
|
||||
# functions added for lapack-3.6.0
|
||||
|
||||
cgejsv,
|
||||
cgesvdx,
|
||||
cgesvj,
|
||||
cgetrf2,
|
||||
cgges3,
|
||||
cggev3,
|
||||
cgghd3,
|
||||
cggsvd3,
|
||||
cggsvp3,
|
||||
cgsvj0,
|
||||
cgsvj1,
|
||||
clagge,
|
||||
claghe,
|
||||
clagsy,
|
||||
clahilb,
|
||||
clakf2,
|
||||
clarge,
|
||||
clarnd,
|
||||
claror,
|
||||
clarot,
|
||||
clatm1,
|
||||
clatm2,
|
||||
clatm3,
|
||||
clatm5,
|
||||
clatm6,
|
||||
clatme,
|
||||
clatmr,
|
||||
clatms,
|
||||
clatmt,
|
||||
cpotrf2,
|
||||
csbmv,
|
||||
cspr2,
|
||||
csyr2,
|
||||
cunm22,
|
||||
dbdsvdx,
|
||||
dgesvdx,
|
||||
dgetrf2,
|
||||
dgges3,
|
||||
dggev3,
|
||||
dgghd3,
|
||||
dggsvd3,
|
||||
dggsvp3,
|
||||
dladiv2,
|
||||
dlagge,
|
||||
dlagsy,
|
||||
dlahilb,
|
||||
dlakf2,
|
||||
dlaran,
|
||||
dlarge,
|
||||
dlarnd,
|
||||
dlaror,
|
||||
dlarot,
|
||||
dlatm1,
|
||||
dlatm2,
|
||||
dlatm3,
|
||||
dlatm5,
|
||||
dlatm6,
|
||||
dlatm7,
|
||||
dlatme,
|
||||
dlatmr,
|
||||
dlatms,
|
||||
dlatmt,
|
||||
dorm22,
|
||||
dpotrf2,
|
||||
dsecnd,
|
||||
sbdsvdx,
|
||||
second,
|
||||
sgesvdx,
|
||||
sgetrf2,
|
||||
sgges3,
|
||||
sggev3,
|
||||
sgghd3,
|
||||
sggsvd3,
|
||||
sggsvp3,
|
||||
sladiv2,
|
||||
slagge,
|
||||
slagsy,
|
||||
slahilb,
|
||||
slakf2,
|
||||
slaran,
|
||||
slarge,
|
||||
slarnd,
|
||||
slaror,
|
||||
slarot,
|
||||
slatm1,
|
||||
slatm2,
|
||||
slatm3,
|
||||
slatm5,
|
||||
slatm6,
|
||||
slatm7,
|
||||
slatme,
|
||||
slatmr,
|
||||
slatms,
|
||||
slatmt,
|
||||
sorm22,
|
||||
spotrf2,
|
||||
zgejsv,
|
||||
zgesvdx,
|
||||
zgesvj,
|
||||
zgetrf2,
|
||||
zgges3,
|
||||
zggev3,
|
||||
zgghd3,
|
||||
zggsvd3,
|
||||
zggsvp3,
|
||||
zgsvj0,
|
||||
zgsvj1,
|
||||
zlagge,
|
||||
zlaghe,
|
||||
zlagsy,
|
||||
zlahilb,
|
||||
zlakf2,
|
||||
zlarge,
|
||||
zlarnd,
|
||||
zlaror,
|
||||
zlarot,
|
||||
zlatm1,
|
||||
zlatm2,
|
||||
zlatm3,
|
||||
zlatm5,
|
||||
zlatm6,
|
||||
zlatme,
|
||||
zlatmr,
|
||||
zlatms,
|
||||
zlatmt,
|
||||
zpotrf2,
|
||||
zsbmv,
|
||||
zspr2,
|
||||
zsyr2,
|
||||
zunm22
|
||||
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
|
@ -459,6 +590,13 @@
|
|||
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
|
||||
);
|
||||
|
||||
@lapack_deprecated_objs = (
|
||||
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
|
||||
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
|
||||
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
|
||||
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
|
||||
);
|
||||
|
||||
@lapackeobjs = (
|
||||
# LAPACK C interface routines.
|
||||
#
|
||||
|
@ -682,8 +820,6 @@
|
|||
LAPACKE_cgeqlf_work,
|
||||
LAPACKE_cgeqp3,
|
||||
LAPACKE_cgeqp3_work,
|
||||
LAPACKE_cgeqpf,
|
||||
LAPACKE_cgeqpf_work,
|
||||
LAPACKE_cgeqr2,
|
||||
LAPACKE_cgeqr2_work,
|
||||
LAPACKE_cgeqrf,
|
||||
|
@ -738,10 +874,6 @@
|
|||
LAPACKE_cggqrf_work,
|
||||
LAPACKE_cggrqf,
|
||||
LAPACKE_cggrqf_work,
|
||||
LAPACKE_cggsvd,
|
||||
LAPACKE_cggsvd_work,
|
||||
LAPACKE_cggsvp,
|
||||
LAPACKE_cggsvp_work,
|
||||
LAPACKE_cgtcon,
|
||||
LAPACKE_cgtcon_work,
|
||||
LAPACKE_cgtrfs,
|
||||
|
@ -1186,8 +1318,6 @@
|
|||
LAPACKE_dgeqlf_work,
|
||||
LAPACKE_dgeqp3,
|
||||
LAPACKE_dgeqp3_work,
|
||||
LAPACKE_dgeqpf,
|
||||
LAPACKE_dgeqpf_work,
|
||||
LAPACKE_dgeqr2,
|
||||
LAPACKE_dgeqr2_work,
|
||||
LAPACKE_dgeqrf,
|
||||
|
@ -1244,10 +1374,6 @@
|
|||
LAPACKE_dggqrf_work,
|
||||
LAPACKE_dggrqf,
|
||||
LAPACKE_dggrqf_work,
|
||||
LAPACKE_dggsvd,
|
||||
LAPACKE_dggsvd_work,
|
||||
LAPACKE_dggsvp,
|
||||
LAPACKE_dggsvp_work,
|
||||
LAPACKE_dgtcon,
|
||||
LAPACKE_dgtcon_work,
|
||||
LAPACKE_dgtrfs,
|
||||
|
@ -1676,8 +1802,6 @@
|
|||
LAPACKE_sgeqlf_work,
|
||||
LAPACKE_sgeqp3,
|
||||
LAPACKE_sgeqp3_work,
|
||||
LAPACKE_sgeqpf,
|
||||
LAPACKE_sgeqpf_work,
|
||||
LAPACKE_sgeqr2,
|
||||
LAPACKE_sgeqr2_work,
|
||||
LAPACKE_sgeqrf,
|
||||
|
@ -1734,10 +1858,6 @@
|
|||
LAPACKE_sggqrf_work,
|
||||
LAPACKE_sggrqf,
|
||||
LAPACKE_sggrqf_work,
|
||||
LAPACKE_sggsvd,
|
||||
LAPACKE_sggsvd_work,
|
||||
LAPACKE_sggsvp,
|
||||
LAPACKE_sggsvp_work,
|
||||
LAPACKE_sgtcon,
|
||||
LAPACKE_sgtcon_work,
|
||||
LAPACKE_sgtrfs,
|
||||
|
@ -2158,8 +2278,6 @@
|
|||
LAPACKE_zgeqlf_work,
|
||||
LAPACKE_zgeqp3,
|
||||
LAPACKE_zgeqp3_work,
|
||||
LAPACKE_zgeqpf,
|
||||
LAPACKE_zgeqpf_work,
|
||||
LAPACKE_zgeqr2,
|
||||
LAPACKE_zgeqr2_work,
|
||||
LAPACKE_zgeqrf,
|
||||
|
@ -2214,10 +2332,6 @@
|
|||
LAPACKE_zggqrf_work,
|
||||
LAPACKE_zggrqf,
|
||||
LAPACKE_zggrqf_work,
|
||||
LAPACKE_zggsvd,
|
||||
LAPACKE_zggsvd_work,
|
||||
LAPACKE_zggsvp,
|
||||
LAPACKE_zggsvp_work,
|
||||
LAPACKE_zgtcon,
|
||||
LAPACKE_zgtcon_work,
|
||||
LAPACKE_zgtrfs,
|
||||
|
@ -2707,6 +2821,134 @@
|
|||
LAPACKE_slagsy_work,
|
||||
LAPACKE_zlagsy,
|
||||
LAPACKE_zlagsy_work,
|
||||
## new functions from lapack-3.6.0
|
||||
|
||||
LAPACKE_cgejsv,
|
||||
LAPACKE_cgejsv_work,
|
||||
LAPACKE_cgesvdx,
|
||||
LAPACKE_cgesvdx_work,
|
||||
LAPACKE_cgesvj,
|
||||
LAPACKE_cgesvj_work,
|
||||
LAPACKE_cgetrf2,
|
||||
LAPACKE_cgetrf2_work,
|
||||
LAPACKE_cgges3,
|
||||
LAPACKE_cgges3_work,
|
||||
LAPACKE_cggev3,
|
||||
LAPACKE_cggev3_work,
|
||||
LAPACKE_cgghd3,
|
||||
LAPACKE_cgghd3_work,
|
||||
LAPACKE_cggsvd3,
|
||||
LAPACKE_cggsvd3_work,
|
||||
LAPACKE_cggsvp3,
|
||||
LAPACKE_cggsvp3_work,
|
||||
LAPACKE_chetrf_rook,
|
||||
LAPACKE_chetrf_rook_work,
|
||||
LAPACKE_chetrs_rook,
|
||||
LAPACKE_chetrs_rook_work,
|
||||
LAPACKE_clapmt,
|
||||
LAPACKE_clapmt_work,
|
||||
LAPACKE_clascl,
|
||||
LAPACKE_clascl_work,
|
||||
LAPACKE_cpotrf2,
|
||||
LAPACKE_cpotrf2_work,
|
||||
LAPACKE_csytrf_rook,
|
||||
LAPACKE_csytrf_rook_work,
|
||||
LAPACKE_csytrs_rook,
|
||||
LAPACKE_csytrs_rook_work,
|
||||
LAPACKE_cuncsd2by1,
|
||||
LAPACKE_cuncsd2by1_work,
|
||||
LAPACKE_dbdsvdx,
|
||||
LAPACKE_dbdsvdx_work,
|
||||
LAPACKE_dgesvdx,
|
||||
LAPACKE_dgesvdx_work,
|
||||
LAPACKE_dgetrf2,
|
||||
LAPACKE_dgetrf2_work,
|
||||
LAPACKE_dgges3,
|
||||
LAPACKE_dgges3_work,
|
||||
LAPACKE_dggev3,
|
||||
LAPACKE_dggev3_work,
|
||||
LAPACKE_dgghd3,
|
||||
LAPACKE_dgghd3_work,
|
||||
LAPACKE_dggsvd3,
|
||||
LAPACKE_dggsvd3_work,
|
||||
LAPACKE_dggsvp3,
|
||||
LAPACKE_dggsvp3_work,
|
||||
LAPACKE_dlapmt,
|
||||
LAPACKE_dlapmt_work,
|
||||
LAPACKE_dlascl,
|
||||
LAPACKE_dlascl_work,
|
||||
LAPACKE_dorcsd2by1,
|
||||
LAPACKE_dorcsd2by1_work,
|
||||
LAPACKE_dpotrf2,
|
||||
LAPACKE_dpotrf2_work,
|
||||
LAPACKE_dsytrf_rook,
|
||||
LAPACKE_dsytrf_rook_work,
|
||||
LAPACKE_dsytrs_rook,
|
||||
LAPACKE_dsytrs_rook_work,
|
||||
LAPACKE_sbdsvdx,
|
||||
LAPACKE_sbdsvdx_work,
|
||||
LAPACKE_sgesvdx,
|
||||
LAPACKE_sgesvdx_work,
|
||||
LAPACKE_sgetrf2,
|
||||
LAPACKE_sgetrf2_work,
|
||||
LAPACKE_sgges3,
|
||||
LAPACKE_sgges3_work,
|
||||
LAPACKE_sggev3,
|
||||
LAPACKE_sggev3_work,
|
||||
LAPACKE_sgghd3,
|
||||
LAPACKE_sgghd3_work,
|
||||
LAPACKE_sggsvd3,
|
||||
LAPACKE_sggsvd3_work,
|
||||
LAPACKE_sggsvp3,
|
||||
LAPACKE_sggsvp3_work,
|
||||
LAPACKE_slapmt,
|
||||
LAPACKE_slapmt_work,
|
||||
LAPACKE_slascl,
|
||||
LAPACKE_slascl_work,
|
||||
LAPACKE_sorcsd2by1,
|
||||
LAPACKE_sorcsd2by1_work,
|
||||
LAPACKE_spotrf2,
|
||||
LAPACKE_spotrf2_work,
|
||||
LAPACKE_ssytrf_rook,
|
||||
LAPACKE_ssytrf_rook_work,
|
||||
LAPACKE_ssytrs_rook,
|
||||
LAPACKE_ssytrs_rook_work,
|
||||
LAPACKE_stpqrt,
|
||||
LAPACKE_stpqrt_work,
|
||||
LAPACKE_zgejsv,
|
||||
LAPACKE_zgejsv_work,
|
||||
LAPACKE_zgesvdx,
|
||||
LAPACKE_zgesvdx_work,
|
||||
LAPACKE_zgesvj,
|
||||
LAPACKE_zgesvj_work,
|
||||
LAPACKE_zgetrf2,
|
||||
LAPACKE_zgetrf2_work,
|
||||
LAPACKE_zgges3,
|
||||
LAPACKE_zgges3_work,
|
||||
LAPACKE_zggev3,
|
||||
LAPACKE_zggev3_work,
|
||||
LAPACKE_zgghd3,
|
||||
LAPACKE_zgghd3_work,
|
||||
LAPACKE_zggsvd3,
|
||||
LAPACKE_zggsvd3_work,
|
||||
LAPACKE_zggsvp3,
|
||||
LAPACKE_zggsvp3_work,
|
||||
LAPACKE_zhetrf_rook,
|
||||
LAPACKE_zhetrf_rook_work,
|
||||
LAPACKE_zhetrs_rook,
|
||||
LAPACKE_zhetrs_rook_work,
|
||||
LAPACKE_zlapmt,
|
||||
LAPACKE_zlapmt_work,
|
||||
LAPACKE_zlascl,
|
||||
LAPACKE_zlascl_work,
|
||||
LAPACKE_zpotrf2,
|
||||
LAPACKE_zpotrf2_work,
|
||||
LAPACKE_zsytrf_rook,
|
||||
LAPACKE_zsytrf_rook_work,
|
||||
LAPACKE_zsytrs_rook,
|
||||
LAPACKE_zsytrs_rook_work,
|
||||
LAPACKE_zuncsd2by1,
|
||||
LAPACKE_zuncsd2by1_work
|
||||
);
|
||||
|
||||
# These functions may need 2 underscores.
|
||||
|
@ -2749,6 +2991,11 @@ if ($ARGV[8] == 1) {
|
|||
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
|
||||
};
|
||||
|
||||
if ($ARGV[11] == 1){
|
||||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
|
||||
}
|
||||
|
||||
} else {
|
||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||
}
|
||||
|
|
5
f_check
5
f_check
|
@ -1,5 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
#
|
||||
# 1. Not specified
|
||||
# 1.1 Automatically detect, then check compiler
|
||||
|
@ -272,8 +274,9 @@ if ($link ne "") {
|
|||
}
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
next if ($hostos eq 'SunOS');
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
}
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
|
|
42
getarch.c
42
getarch.c
|
@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__sun__)
|
||||
#include <sys/sysinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "POWER5"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
|
||||
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER6"
|
||||
|
@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "POWER6"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_POWER8)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER8"
|
||||
#define SUBDIRNAME "power"
|
||||
#define ARCHCONFIG "-DPOWER8 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "power8"
|
||||
#define CORENAME "POWER8"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_PPCG4
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
|
@ -819,10 +833,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
|
||||
#define LIBNAME "armv8"
|
||||
#define CORENAME "XGENE1"
|
||||
#else
|
||||
#define CORENAME "ARMV8"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA57
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "ARMV8"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA57 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "cortexa57"
|
||||
#define CORENAME "CORTEXA57"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
|
@ -892,7 +920,7 @@ static int get_num_cores(void) {
|
|||
size_t len;
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__sun__)
|
||||
//returns the number of processors which are currently online
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
|
||||
|
@ -984,7 +1012,9 @@ int main(int argc, char *argv[]){
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if NO_PARALLEL_MAKE==1
|
||||
#ifdef MAKE_NB_JOBS
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
#ifndef OS_WINDOWS
|
||||
|
|
|
@ -79,11 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
|||
FLOAT alpha = *ALPHA;
|
||||
FLOAT beta = *BETA;
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
|
||||
|
@ -134,13 +132,10 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
|
||||
|
@ -215,43 +210,20 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incx < 0) x -= (lenx - 1) * incx;
|
||||
if (incy < 0) y -= (leny - 1) * incy;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
// make it volatile because some gemv implementation (ex: dgemv_n.S)
|
||||
// do not restore all register
|
||||
volatile int stack_alloc_size = 0;
|
||||
//for gemv_n and gemv_t, try to allocate on stack
|
||||
stack_alloc_size = m + n;
|
||||
#ifdef ALIGNED_ACCESS
|
||||
stack_alloc_size += 3;
|
||||
#endif
|
||||
if(stack_alloc_size < 128)
|
||||
//dgemv_n.S require a 128 bytes buffer
|
||||
stack_alloc_size = 128;
|
||||
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
// printf("stack_alloc_size=%d\n", stack_alloc_size);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = m + n + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
nthreads_max = num_cpu_avail(2);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
MNK = (double) m * (double) n;
|
||||
if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
|
||||
nthreads_max = 1;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
nthreads = nthreads_avail;
|
||||
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = nthreads_max;
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
@ -266,14 +238,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size){
|
||||
blas_memory_free(buffer);
|
||||
}
|
||||
#else
|
||||
blas_memory_free(buffer);
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
|
|
@ -171,19 +171,14 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incy < 0) y -= (n - 1) * incy;
|
||||
if (incx < 0) x -= (m - 1) * incx;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
volatile int stack_alloc_size = m;
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
#else
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#endif
|
||||
STACK_ALLOC(m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = num_cpu_avail(2);
|
||||
else
|
||||
nthreads = 1;
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
@ -198,11 +193,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size)
|
||||
#endif
|
||||
blas_memory_free(buffer);
|
||||
|
||||
STACK_FREE(buffer);
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
|
|
@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
s = db / r;
|
||||
z = ONE;
|
||||
if (ada > adb) z = s;
|
||||
if ((ada < adb) && (c != ZERO)) z = ONE / c;
|
||||
if ((ada <= adb) && (c != ZERO)) z = ONE / c;
|
||||
|
||||
*C = c;
|
||||
*S = s;
|
||||
|
|
|
@ -77,12 +77,13 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
|
|||
if (incy < 0) y -= (n - 1) * incy;
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
nthreads = 1;
|
||||
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
@ -91,6 +91,27 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define MODE (BLAS_XDOUBLE | BLAS_REAL)
|
||||
#elif defined(DOUBLE)
|
||||
#define MODE (BLAS_DOUBLE | BLAS_REAL)
|
||||
#else
|
||||
#define MODE (BLAS_SINGLE | BLAS_REAL)
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
|
||||
#elif defined(DOUBLE)
|
||||
#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
|
||||
#else
|
||||
#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
#ifndef GEMM3M
|
||||
#ifndef HEMM
|
||||
|
@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
|
|||
FLOAT *buffer;
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
int nodes;
|
||||
#endif
|
||||
|
@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
FLOAT *buffer;
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
int nodes;
|
||||
#endif
|
||||
|
@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
|
||||
args.nthreads /= nodes;
|
||||
|
||||
gemm_thread_mn(mode, &args, NULL, NULL,
|
||||
gemm_thread_mn(MODE, &args, NULL, NULL,
|
||||
symm[4 | (side << 1) | uplo ], sa, sb, nodes);
|
||||
|
||||
} else {
|
||||
|
@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
|
||||
#else
|
||||
|
||||
GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
|
||||
GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -77,11 +77,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
|||
blasint incy = *INCY;
|
||||
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
|
||||
|
@ -144,13 +142,10 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
|
||||
|
@ -236,22 +231,26 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
// cgemv_t.S returns NaN if there are NaN or Inf values in the buffer (see bug #746)
|
||||
if(trans && stack_alloc_size)
|
||||
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
nthreads_max = num_cpu_avail(2);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
MNK = (double) m * (double) n;
|
||||
if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
|
||||
nthreads_max = 1;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
nthreads = nthreads_avail;
|
||||
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = nthreads_max;
|
||||
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
@ -267,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
|
|
@ -210,10 +210,14 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
nthreads = num_cpu_avail(2);
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = num_cpu_avail(2);
|
||||
else
|
||||
nthreads = 1;
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
@ -245,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
|
|
@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
FLOAT beta_i = BETA[1];
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
FLOAT alpha_r = ALPHA[0];
|
||||
FLOAT alpha_i = ALPHA[1];
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue