Merge branch 'develop'

0.2.20 version
This commit is contained in:
Zhang Xianyi 2017-07-24 12:03:35 +08:00
commit 5dde4e65d3
4371 changed files with 304566 additions and 74537 deletions

18
.gitignore vendored
View File

@ -14,6 +14,21 @@ lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
lapack-netlib/INSTALL/test*
lapack-netlib/TESTING/xeigtstc
lapack-netlib/TESTING/xeigtstd
lapack-netlib/TESTING/xeigtsts
lapack-netlib/TESTING/xeigtstz
lapack-netlib/TESTING/xlintstc
lapack-netlib/TESTING/xlintstd
lapack-netlib/TESTING/xlintstds
lapack-netlib/TESTING/xlintstrfc
lapack-netlib/TESTING/xlintstrfd
lapack-netlib/TESTING/xlintstrfs
lapack-netlib/TESTING/xlintstrfz
lapack-netlib/TESTING/xlintsts
lapack-netlib/TESTING/xlintstz
lapack-netlib/TESTING/xlintstzc
*.so
*.so.*
*.a
@ -69,3 +84,6 @@ test/zblat3
build
build.*
*.swp
benchmark/*.goto
benchmark/smallscaling

View File

@ -2,16 +2,19 @@
## Author: Hank Anderson <hank@statease.com>
##
cmake_minimum_required(VERSION 2.8.4)
cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 19)
set(OpenBLAS_PATCH_VERSION 20)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
enable_language(C)
# Adhere to GNU filesystem layout conventions
include(GNUInstallDirs)
if(MSVC)
set(OpenBLAS_LIBNAME libopenblas)
else()
@ -30,10 +33,20 @@ set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
set(CMAKE_BUILD_TYPE
Debug Debug
Release Release
)
else()
set(CMAKE_BUILD_TYPE Release)
if( NOT CMAKE_BUILD_TYPE )
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
else()
set(CMAKE_BUILD_TYPE Release)
endif()
endif()
endif()
if(BUILD_WITHOUT_CBLAS)
@ -107,9 +120,12 @@ if (${NO_STATIC} AND ${NO_SHARED})
endif ()
#Set default output directory
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if(MSVC)
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
endif ()
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@ -129,9 +145,12 @@ if (NOT NO_LAPACKE)
endif ()
endif ()
#Only generate .def for dll on MSVC
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
endif()
# add objects to the openblas lib
@ -141,25 +160,29 @@ include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
endforeach()
enable_testing()
add_subdirectory(utest)
if(NOT MSVC)
#only build shared library for MSVC
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if (NOT MSVC)
#only build shared library for MSVC
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
@ -198,3 +221,73 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
#endif
# @touch lib.grd
# Install project
# Install libraries
install(TARGETS ${OpenBLAS_LIBNAME}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install include files
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
ADD_CUSTOM_COMMAND(
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
ADD_CUSTOM_TARGET(genconfig
ALL
DEPENDS openblas_config.h
)
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas
ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
)
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas
ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
ADD_CUSTOM_TARGET(genlapacke
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT MSVC)
install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif()

View File

@ -161,3 +161,10 @@ In chronological order:
* Kaustubh Raste <https://github.com/ksraste/>
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
* Abdelrauf <https://github.com/quickwritereader>
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13

View File

@ -1,4 +1,45 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.20
24-Jul-2017
common:
* Improved CMake support
* Fixed several thread race and locking bugs
* Fixed default LAPACK optimization level
* Updated LAPACK to 3.7.0
* Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1)
POWER:
* Optimizations for Power9
* Fixed several Power8 assembly bugs
ARM:
* New optimized Vulcan and ThunderX2T99 targets
* Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1)
* Detect all cpu cores including offline ones
* Fix compilation with CLANG
* Support building a shared library for Android
MIPS:
* Fixed several threading issues
* Fix compilation with CLANG
x86_64:
* Detect Intel Bay Trail and Apollo Lake
* Detect Intel Sky Lake and Kaby Lake
* Detect Intel Knights Landing
* Detect AMD A8, A10, A12 and Ryzen
* Support 64bit builds with Visual Studio
* Fix building with Intel and PGI compilers
* Fix building with MINGW and TDM-GCC
* Fix cmake builds for Haswell and related cpus
* Fix building for Sandybridge with CLANG 3.9
* Add support for the FLANG compiler
IBM Z:
* New target z13 with BLAS3 optimizations
====================================================================
Version 0.2.19
1-Sep-2016

View File

@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
RELA =
ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install blas-test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
all :: libs netlib tests shared
all :: libs netlib $(RELA) tests shared
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@ -81,7 +86,7 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ -215,6 +220,14 @@ ifndef NO_LAPACKE
endif
endif
ifeq ($(NO_LAPACK), 1)
re_lapack :
else
re_lapack :
@$(MAKE) -C relapack
endif
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@ -278,13 +291,13 @@ lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
make -C $(NETLIB_LAPACK_DIR)/TIMING
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
@ -299,7 +312,7 @@ lapack-runtest:
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
@ -326,6 +339,7 @@ endif
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.

View File

@ -1,31 +1,19 @@
# ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif

View File

@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif
ifeq ($(CORE), VULCAN)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif

View File

@ -12,6 +12,7 @@ OPENBLAS_BUILD_DIR := $(CURDIR)
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
.PHONY : install
.NOTPARALLEL : install
@ -25,6 +26,7 @@ install : lib.grd
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@ -50,7 +52,7 @@ ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
@ -64,7 +66,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
@ -91,9 +93,20 @@ ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
endif
endif
#Generating openblas.pc
@echo Generating openblas.pc in $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'version='$(VERSION) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'extralib='$(EXTRALIB) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@cat openblas.pc.in >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -43,7 +43,7 @@ endif
ifeq ($(USE_MASS), 1)
# Path to MASS libs, change it if the libs are installed at any other location
MASSPATH = /opt/ibm/xlmass/8.1.3/lib
MASSPATH = /opt/ibm/xlmass/8.1.5/lib
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.19
VERSION = 0.2.20
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -83,6 +83,9 @@ VERSION = 0.2.19
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
NO_AFFINITY = 1
#NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1

View File

@ -68,6 +68,9 @@ endif
ifeq ($(TARGET), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@ -98,6 +101,9 @@ endif
ifeq ($(TARGET_CORE), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@ -217,7 +223,9 @@ endif
#
ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6
endif
MD5SUM = md5 -r
endif
@ -234,6 +242,10 @@ EXTRALIB += -lm
NO_EXPRECISION = 1
endif
ifeq ($(OSNAME), Android)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
@ -406,7 +418,6 @@ CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), CLANG)
$(error OpenBLAS: Clang didn't support OpenMP yet.)
CCOMMON_OPT += -fopenmp
endif
@ -441,12 +452,13 @@ ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
DYNAMIC_CORE += HASWELL ZEN
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
DYNAMIC_ARCH =
override DYNAMIC_ARCH=
endif
endif
@ -474,6 +486,23 @@ endif
ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfloat-abi=softfp
FCOMMON_OPT += -mfloat-abi=softfp
endif
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
EXTRALIB += -lm
else
EXTRALIB += -Wl,-lm_hard
endif
endif
endif
ifeq ($(ARCH), arm64)
@ -575,6 +604,23 @@ endif
# Fortran Compiler dependent settings
#
ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -Wall
else
FCOMMON_OPT += -Wall
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
endif
ifeq ($(F_COMPILER), G77)
CCOMMON_OPT += -DF_INTERFACE_G77
FCOMMON_OPT += -Wall
@ -1002,7 +1048,7 @@ endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(FCOMMON_OPT)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
@ -1083,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
ifneq ($(NO_LAPACKE), 1)
LIB_COMPONENTS += LAPACKE
endif
ifeq ($(BUILD_RELAPACK), 1)
LIB_COMPONENTS += ReLAPACK
endif
endif
ifeq ($(ONLY_CBLAS), 1)

6
Makefile.zarch Normal file
View File

@ -0,0 +1,6 @@
ifeq ($(CORE), Z13)
CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif

View File

@ -51,18 +51,18 @@ The library can be installed as below -
* On Ubuntu:
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.3
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
sudo apt-get update</br>
sudo apt-get install libxlmass-devel.8.1.5</br>
* On RHEL/CentOS:
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.3
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
After installing MASS library, compile openblas with USE_MASS=1.
@ -106,6 +106,10 @@ Please read GotoBLAS_01Readme.txt
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental
#### IBM zEnterprise System:
- **Z13**: Optimized Level-3 BLAS
### Support OS:
- **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.

View File

@ -34,6 +34,7 @@ BULLDOZER
PILEDRIVER
STEAMROLLER
EXCAVATOR
ZEN
c)VIA CPU:
SSE_GENERIC
@ -80,4 +81,7 @@ ARMV5
8.ARM 64-bit CPU:
ARMV8
CORTEXA57
VULCAN
THUNDERX
THUNDERX2T99

View File

@ -37,6 +37,18 @@ ESSL=/opt/ibm/lib
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
ifneq ($(NO_LAPACK), 1)
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
csymv.goto zsymv.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto
else
GOTO_LAPACK_TARGETS=
endif
ifeq ($(OSNAME), WINNT)
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
@ -147,9 +159,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
else
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
@ -162,20 +172,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \
sscal.goto dscal.goto cscal.goto zscal.goto \
sasum.goto dasum.goto casum.goto zasum.goto \
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
ssymv.goto dsymv.goto \
chemv.goto zhemv.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling \
isamax.goto idamax.goto icamax.goto izamax.goto \
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
srandom(getpid());
#endif
fprintf(stderr, " SIZE Time\n");
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
@ -180,7 +180,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.6f secs\n", timeg);
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
srandom(getpid());
#endif
fprintf(stderr, " SIZE Time\n");
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
@ -180,7 +180,10 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.6f secs\n", timeg);
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -2,61 +2,54 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
l = 1
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
start <- proc.time()[3]
while ( l <= loops ) {
ev <- eigen(A)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

View File

@ -2,62 +2,63 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
C <- 1
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
z <- system.time(for (l in 1:loops) {
C <- A %*% B
l <- l + 1
})
l = 1
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
start <- proc.time()[3]
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
while ( l <= loops ) {
C <- A %*% B
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

View File

@ -2,62 +2,56 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
l = 1
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
start <- proc.time()[3]
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
while ( l <= loops ) {
solve(A,B)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

15
c_check
View File

@ -10,6 +10,7 @@ $hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
@ -34,7 +35,7 @@ if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /(.*-)(.*)/) {
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
@ -72,6 +73,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$defined = 0;
@ -96,6 +98,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "zarch") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;
@ -187,6 +194,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
@ -234,6 +242,11 @@ $linker_a = "";
$linker_L .= "-Wl,". $flags . " "
}
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
if (
($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)

View File

@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
endif ()
if (NOT NO_AVX2)
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN")
endif ()
endif ()

View File

@ -73,6 +73,10 @@ if (${ARCH} STREQUAL "X86")
set(ARCH x86)
endif ()
if (${ARCH} MATCHES "ppc")
set(ARCH power)
endif ()
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
set(COMPILER_ID "GCC")
@ -87,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
file(APPEND ${TARGET_CONF}
"#define OS_WINNT\t1\n")
endif ()

View File

@ -3,6 +3,21 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "G77")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")

View File

@ -2,7 +2,7 @@
set(ALLAUX
ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f iparam2stage.F chla_transtype.f
../INSTALL/ilaver.f ../INSTALL/slamch.f
)
@ -26,7 +26,7 @@ set(SCLAUX
)
set(DZLAUX
dbdsdc.f
dbdsdc.f dbdsvdx.f
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
@ -42,20 +42,28 @@ set(DZLAUX
dsteqr.f dsterf.f dlaisnan.f disnan.f
dlartgp.f dlartgs.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f
dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f
dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f
dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f
dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f
dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f
dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F
dsytrd_sy2sb.f dlarfy.f
)
set(SLASRC
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
sgetc2.f sgetri.f
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f
sgetc2.f sgetri.f sgetrf2.f
sggbak.f sggbal.f sgghd3.f sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f
sggglm.f sgghrd.f sgglse.f sggqrf.f
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sggrqf.f DEPRECATED/sggsvd.f sggsvd3.f DEPRECATED/sggsvp.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
@ -72,7 +80,7 @@ set(SLASRC
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f
sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
spbstf.f spbsv.f spbsvx.f
@ -96,7 +104,7 @@ set(SLASRC
stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
stptrs.f
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
@ -106,9 +114,16 @@ set(SLASRC
sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f
sgelq.f sgelqt.f sgelqt3.f sgemlq.f sgemlqt.f sgemqr.f sgeqr.f sgetsls.f
slamswlq.f slamtsqr.f slaswlq.f slatsqr.f stplqt.f stplqt2.f stpmlqt.f
ssysv_aa.f ssytrf_aa.f ssytrs_aa.f slasyf_aa.f ssytf2_rk.f slasyf_rk.f
ssytrf_rk.f ssytrs_3.f ssycon_3.f ssytri_3.f ssytri_3x.f ssysv_rk.f
ssb2st_kernels.f ssbev_2stage.f ssbevd_2stage.f ssbevx_2stage.f
ssyev_2stage.f ssyevd_2stage.f ssyevr_2stage.f ssyevx_2stage.f
ssygv_2stage.f ssytrd_2stage.f ssytrd_sb2st.F ssytrd_sy2sb.f slarfy.f
)
set(DSLASRC spotrs.f)
set(DSLASRC spotrs.f spotrf2.f)
set(CLASRC
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
@ -165,7 +180,7 @@ set(CLASRC
ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
ctprfs.f ctptri.f
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
@ -178,6 +193,14 @@ set(CLASRC
cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f
cgelq.f cgelqt.f cgelqt3.f cgemlq.f cgemlqt.f cgemqr.f cgeqr.f cgetsls.f
clamswlq.f clamtsqr.f claswlq.f clatsqr.f ctplqt.f ctplqt2.f ctpmlqt.f
chesv_aa.f chetrf_aa.f chetrs_aa.f clahef_aa.f csytf2_rk.f clasyf_rk.f
csytrf_rk.f csytrs_3.f csycon_3.f csytri_3.f csytri_3x.f csysv_rk.f
chetf2_rk.f clahef_rk.f chetrf_rk.f chetrs_3.f checon_3.f chetri_3.f
chetri_3x.f chesv_rk.f chb2st_kernels.f chbev_2stage.f chbevd_2stage.f
chbevx_2stage.f cheev_2stage.f cheevd_2stage.f cheevr_2stage.f cheevx_2stage.f
chegv_2stage.f chetrd_2stage.f chetrd_hb2st.F chetrd_he2hb.f clarfy.f
)
set(ZCLASRC cpotrs.f)
@ -189,11 +212,11 @@ set(DLASRC
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
dgetc2.f dgetri.f
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
dggglm.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f
dgetc2.f dgetri.f dgetrf2.f
dggbak.f dggbal.f dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f
dggglm.f dgghd3.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f dggsvd3.f dggsvp3.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
@ -210,12 +233,12 @@ set(DLASRC
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f
dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
dpbstf.f dpbsv.f dpbsvx.f
dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f
dposvx.f dpotrs.f dpstrf.f dpstf2.f
dposvx.f dpotrf2.f dpotrs.f dpstrf.f dpstf2.f
dppcon.f dppequ.f
dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f
dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f
@ -234,7 +257,7 @@ set(DLASRC
dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
dtptrs.f
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
@ -245,20 +268,28 @@ set(DLASRC
dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f
dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f
dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f
dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f
dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f
dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f
dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f
)
set(ZLASRC
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgejsv.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
zgetri.f
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
zgghrd.f zgglse.f zggqrf.f zggrqf.f
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvj.f zgesvx.f zgetc2.f
zgetri.f zgetrf2.f
zggbak.f zggbal.f zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f zggglm.f
zgghd3.f zgghrd.f zgglse.f zggqrf.f zggrqf.f
DEPRECATED/zggsvd.f zggsvd3.f DEPRECATED/zggsvp.f zggsvp3.f
zgsvj0.f zgsvj1.f
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
@ -287,28 +318,28 @@ set(ZLASRC
zlarfg.f zlarft.f zlarfgp.f
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
zposv.f zposvx.f zpotrf2.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f
zsycon.f
zsycon.f zsysv_aa.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f
zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f
zsyswapr.f zsytrs.f zsytrs_aa.f zsytrs2.f zsyconv.f
zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f
zsytri_rook.f zsycon_rook.f zsysv_rook.f
ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
ztprfs.f ztptri.f
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunm22.f zunml2.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
zunmtr.f zupgtr.f
zupmtr.f izmax1.f dzsum1.f zstemr.f
@ -320,6 +351,15 @@ set(ZLASRC
zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f
zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f
zgelq.f zgelqt.f zgelqt3.f zgemlq.f zgemlqt.f zgemqr.f zgeqr.f zgetsls.f
zlamswlq.f zlamtsqr.f zlaswlq.f zlatsqr.f ztplqt.f ztplqt2.f ztpmlqt.f
zhesv_aa.f zhetrf_aa.f zhetrs_aa.f zlahef_aa.f zsytf2_rk.f zlasyf_rk.f
zsytrf_aa.f zsytrf_rk.f zsytrs_3.f zsycon_3.f zsytri_3.f zsytri_3x.f zsysv_rk.f
zhetf2_rk.f zlahef_rk.f zhetrf_rk.f zhetrs_3.f zhecon_3.f zhetri_3.f
zhetri_3x.f zhesv_rk.f zhb2st_kernels.f zhbev_2stage.f zhbevd_2stage.f
zhbevx_2stage.f zheev_2stage.f zheevd_2stage.f zheevr_2stage.f
zheevx_2stage.f zhegv_2stage.f zhetrd_2stage.f zhetrd_hb2st.F zhetrd_he2hb.f
zlarfy.f
)
set(LA_REL_SRC ${ALLAUX})

View File

@ -44,6 +44,8 @@ set(C_SRC
lapacke_cgeevx_work.c
lapacke_cgehrd.c
lapacke_cgehrd_work.c
lapacke_cgejsv.c
lapacke_cgejsv_work.c
lapacke_cgelq2.c
lapacke_cgelq2_work.c
lapacke_cgelqf.c
@ -56,14 +58,14 @@ set(C_SRC
lapacke_cgelss_work.c
lapacke_cgelsy.c
lapacke_cgelsy_work.c
lapacke_cgemqr.c
lapacke_cgemqr_work.c
lapacke_cgemqrt.c
lapacke_cgemqrt_work.c
lapacke_cgeqlf.c
lapacke_cgeqlf_work.c
lapacke_cgeqp3.c
lapacke_cgeqp3_work.c
lapacke_cgeqpf.c
lapacke_cgeqpf_work.c
lapacke_cgeqr2.c
lapacke_cgeqr2_work.c
lapacke_cgeqrf.c
@ -86,42 +88,56 @@ set(C_SRC
lapacke_cgesv_work.c
lapacke_cgesvd.c
lapacke_cgesvd_work.c
lapacke_cgesvdx.c
lapacke_cgesvdx_work.c
lapacke_cgesvj.c
lapacke_cgesvj_work.c
lapacke_cgesvx.c
lapacke_cgesvx_work.c
lapacke_cgetf2.c
lapacke_cgetf2_work.c
lapacke_cgetrf.c
lapacke_cgetrf_work.c
lapacke_cgetrf2.c
lapacke_cgetrf2_work.c
lapacke_cgetri.c
lapacke_cgetri_work.c
lapacke_cgetrs.c
lapacke_cgetrs_work.c
lapacke_cgetsls.c
lapacke_cgetsls_work.c
lapacke_cggbak.c
lapacke_cggbak_work.c
lapacke_cggbal.c
lapacke_cggbal_work.c
lapacke_cgges.c
lapacke_cgges_work.c
lapacke_cgges3.c
lapacke_cgges3_work.c
lapacke_cggesx.c
lapacke_cggesx_work.c
lapacke_cggev.c
lapacke_cggev_work.c
lapacke_cggev3.c
lapacke_cggev3_work.c
lapacke_cggevx.c
lapacke_cggevx_work.c
lapacke_cggglm.c
lapacke_cggglm_work.c
lapacke_cgghrd.c
lapacke_cgghrd_work.c
lapacke_cgghd3.c
lapacke_cgghd3_work.c
lapacke_cgglse.c
lapacke_cgglse_work.c
lapacke_cggqrf.c
lapacke_cggqrf_work.c
lapacke_cggrqf.c
lapacke_cggrqf_work.c
lapacke_cggsvd.c
lapacke_cggsvd_work.c
lapacke_cggsvp.c
lapacke_cggsvp_work.c
lapacke_cggsvd3.c
lapacke_cggsvd3_work.c
lapacke_cggsvp3.c
lapacke_cggsvp3_work.c
lapacke_cgtcon.c
lapacke_cgtcon_work.c
lapacke_cgtrfs.c
@ -140,6 +156,12 @@ set(C_SRC
lapacke_chbevd_work.c
lapacke_chbevx.c
lapacke_chbevx_work.c
lapacke_chbev_2stage.c
lapacke_chbev_2stage_work.c
lapacke_chbevd_2stage.c
lapacke_chbevd_2stage_work.c
lapacke_chbevx_2stage.c
lapacke_chbevx_2stage_work.c
lapacke_chbgst.c
lapacke_chbgst_work.c
lapacke_chbgv.c
@ -152,6 +174,8 @@ set(C_SRC
lapacke_chbtrd_work.c
lapacke_checon.c
lapacke_checon_work.c
lapacke_checon_3.c
lapacke_checon_3_work.c
lapacke_cheequb.c
lapacke_cheequb_work.c
lapacke_cheev.c
@ -162,10 +186,20 @@ set(C_SRC
lapacke_cheevr_work.c
lapacke_cheevx.c
lapacke_cheevx_work.c
lapacke_cheev_2stage.c
lapacke_cheev_2stage_work.c
lapacke_cheevd_2stage.c
lapacke_cheevd_2stage_work.c
lapacke_cheevr_2stage.c
lapacke_cheevr_2stage_work.c
lapacke_cheevx_2stage.c
lapacke_cheevx_2stage_work.c
lapacke_chegst.c
lapacke_chegst_work.c
lapacke_chegv.c
lapacke_chegv_work.c
lapacke_chegv_2stage.c
lapacke_chegv_2stage_work.c
lapacke_chegvd.c
lapacke_chegvd_work.c
lapacke_chegvx.c
@ -174,6 +208,10 @@ set(C_SRC
lapacke_cherfs_work.c
lapacke_chesv.c
lapacke_chesv_work.c
lapacke_chesv_aa.c
lapacke_chesv_aa_work.c
lapacke_chesv_rk.c
lapacke_chesv_rk_work.c
lapacke_chesvx.c
lapacke_chesvx_work.c
lapacke_cheswapr.c
@ -181,17 +219,31 @@ set(C_SRC
lapacke_chetrd.c
lapacke_chetrd_work.c
lapacke_chetrf.c
lapacke_chetrf_rook.c
lapacke_chetrf_work.c
lapacke_chetrf_rook_work.c
lapacke_chetrf_aa.c
lapacke_chetrf_aa_work.c
lapacke_chetrf_rk.c
lapacke_chetrf_rk_work.c
lapacke_chetri.c
lapacke_chetri2.c
lapacke_chetri2_work.c
lapacke_chetri_3.c
lapacke_chetri_3_work.c
lapacke_chetri2x.c
lapacke_chetri2x_work.c
lapacke_chetri_work.c
lapacke_chetrs.c
lapacke_chetrs_rook.c
lapacke_chetrs2.c
lapacke_chetrs2_work.c
lapacke_chetrs_work.c
lapacke_chetrs_rook_work.c
lapacke_chetrs_aa.c
lapacke_chetrs_aa_work.c
lapacke_chetrs_3.c
lapacke_chetrs_3_work.c
lapacke_chfrk.c
lapacke_chfrk_work.c
lapacke_chgeqz.c
@ -250,6 +302,8 @@ set(C_SRC
lapacke_clantr_work.c
lapacke_clapmr.c
lapacke_clapmr_work.c
lapacke_clapmt.c
lapacke_clapmt_work.c
lapacke_clarfb.c
lapacke_clarfb_work.c
lapacke_clarfg.c
@ -260,6 +314,8 @@ set(C_SRC
lapacke_clarfx_work.c
lapacke_clarnv.c
lapacke_clarnv_work.c
lapacke_clascl.c
lapacke_clascl_work.c
lapacke_claset.c
lapacke_claset_work.c
lapacke_claswp.c
@ -302,6 +358,8 @@ set(C_SRC
lapacke_cposvx_work.c
lapacke_cpotrf.c
lapacke_cpotrf_work.c
lapacke_cpotrf2.c
lapacke_cpotrf2_work.c
lapacke_cpotri.c
lapacke_cpotri_work.c
lapacke_cpotrs.c
@ -364,6 +422,8 @@ set(C_SRC
lapacke_csteqr_work.c
lapacke_csycon.c
lapacke_csycon_work.c
lapacke_csycon_3.c
lapacke_csycon_3_work.c
lapacke_csyconv.c
lapacke_csyconv_work.c
lapacke_csyequb.c
@ -374,22 +434,40 @@ set(C_SRC
lapacke_csysv_rook.c
lapacke_csysv_rook_work.c
lapacke_csysv_work.c
lapacke_csysv_aa.c
lapacke_csysv_aa_work.c
lapacke_csysv_rk.c
lapacke_csysv_rk_work.c
lapacke_csysvx.c
lapacke_csysvx_work.c
lapacke_csyswapr.c
lapacke_csyswapr_work.c
lapacke_csytrf.c
lapacke_csytrf_work.c
lapacke_csytrf_rook.c
lapacke_csytrf_rook_work.c
lapacke_csytrf_aa.c
lapacke_csytrf_aa_work.c
lapacke_csytrf_rk.c
lapacke_csytrf_rk_work.c
lapacke_csytri.c
lapacke_csytri2.c
lapacke_csytri2_work.c
lapacke_csytri_3.c
lapacke_csytri_3_work.c
lapacke_csytri2x.c
lapacke_csytri2x_work.c
lapacke_csytri_work.c
lapacke_csytrs.c
lapacke_csytrs_rook.c
lapacke_csytrs2.c
lapacke_csytrs2_work.c
lapacke_csytrs_work.c
lapacke_csytrs_rook_work.c
lapacke_csytrs_aa.c
lapacke_csytrs_aa_work.c
lapacke_csytrs_3.c
lapacke_csytrs_3_work.c
lapacke_ctbcon.c
lapacke_ctbcon_work.c
lapacke_ctbrfs.c
@ -464,6 +542,8 @@ set(C_SRC
lapacke_cunbdb_work.c
lapacke_cuncsd.c
lapacke_cuncsd_work.c
lapacke_cuncsd2by1.c
lapacke_cuncsd2by1_work.c
lapacke_cungbr.c
lapacke_cungbr_work.c
lapacke_cunghr.c
@ -505,6 +585,8 @@ set(DSRC
lapacke_dbbcsd_work.c
lapacke_dbdsdc.c
lapacke_dbdsdc_work.c
lapacke_dbdsvdx.c
lapacke_dbdsvdx_work.c
lapacke_dbdsqr.c
lapacke_dbdsqr_work.c
lapacke_ddisna.c
@ -563,14 +645,14 @@ set(DSRC
lapacke_dgelss_work.c
lapacke_dgelsy.c
lapacke_dgelsy_work.c
lapacke_dgemqr.c
lapacke_dgemqr_work.c
lapacke_dgemqrt.c
lapacke_dgemqrt_work.c
lapacke_dgeqlf.c
lapacke_dgeqlf_work.c
lapacke_dgeqp3.c
lapacke_dgeqp3_work.c
lapacke_dgeqpf.c
lapacke_dgeqpf_work.c
lapacke_dgeqr2.c
lapacke_dgeqr2_work.c
lapacke_dgeqrf.c
@ -593,6 +675,8 @@ set(DSRC
lapacke_dgesv_work.c
lapacke_dgesvd.c
lapacke_dgesvd_work.c
lapacke_dgesvdx.c
lapacke_dgesvdx_work.c
lapacke_dgesvj.c
lapacke_dgesvj_work.c
lapacke_dgesvx.c
@ -601,36 +685,46 @@ set(DSRC
lapacke_dgetf2_work.c
lapacke_dgetrf.c
lapacke_dgetrf_work.c
lapacke_dgetrf2.c
lapacke_dgetrf2_work.c
lapacke_dgetri.c
lapacke_dgetri_work.c
lapacke_dgetrs.c
lapacke_dgetrs_work.c
lapacke_dgetsls.c
lapacke_dgetsls_work.c
lapacke_dggbak.c
lapacke_dggbak_work.c
lapacke_dggbal.c
lapacke_dggbal_work.c
lapacke_dgges.c
lapacke_dgges_work.c
lapacke_dgges3.c
lapacke_dgges3_work.c
lapacke_dggesx.c
lapacke_dggesx_work.c
lapacke_dggev.c
lapacke_dggev_work.c
lapacke_dggev3.c
lapacke_dggev3_work.c
lapacke_dggevx.c
lapacke_dggevx_work.c
lapacke_dggglm.c
lapacke_dggglm_work.c
lapacke_dgghrd.c
lapacke_dgghrd_work.c
lapacke_dgghd3.c
lapacke_dgghd3_work.c
lapacke_dgglse.c
lapacke_dgglse_work.c
lapacke_dggqrf.c
lapacke_dggqrf_work.c
lapacke_dggrqf.c
lapacke_dggrqf_work.c
lapacke_dggsvd.c
lapacke_dggsvd_work.c
lapacke_dggsvp.c
lapacke_dggsvp_work.c
lapacke_dggsvd3.c
lapacke_dggsvd3_work.c
lapacke_dggsvp3.c
lapacke_dggsvp3_work.c
lapacke_dgtcon.c
lapacke_dgtcon_work.c
lapacke_dgtrfs.c
@ -665,6 +759,8 @@ set(DSRC
lapacke_dlantr_work.c
lapacke_dlapmr.c
lapacke_dlapmr_work.c
lapacke_dlapmt.c
lapacke_dlapmt_work.c
lapacke_dlapy2.c
lapacke_dlapy2_work.c
lapacke_dlapy3.c
@ -683,6 +779,8 @@ set(DSRC
lapacke_dlartgp_work.c
lapacke_dlartgs.c
lapacke_dlartgs_work.c
lapacke_dlascl.c
lapacke_dlascl_work.c
lapacke_dlaset.c
lapacke_dlaset_work.c
lapacke_dlasrt.c
@ -697,6 +795,8 @@ set(DSRC
lapacke_dopmtr_work.c
lapacke_dorbdb.c
lapacke_dorbdb_work.c
lapacke_dorcsd2by1.c
lapacke_dorcsd2by1_work.c
lapacke_dorcsd.c
lapacke_dorcsd_work.c
lapacke_dorgbr.c
@ -765,6 +865,8 @@ set(DSRC
lapacke_dposvx_work.c
lapacke_dpotrf.c
lapacke_dpotrf_work.c
lapacke_dpotrf2.c
lapacke_dpotrf2_work.c
lapacke_dpotri.c
lapacke_dpotri_work.c
lapacke_dpotrs.c
@ -807,6 +909,12 @@ set(DSRC
lapacke_dsbevd_work.c
lapacke_dsbevx.c
lapacke_dsbevx_work.c
lapacke_dsbev_2stage.c
lapacke_dsbev_2stage_work.c
lapacke_dsbevd_2stage.c
lapacke_dsbevd_2stage_work.c
lapacke_dsbevx_2stage.c
lapacke_dsbevx_2stage_work.c
lapacke_dsbgst.c
lapacke_dsbgst_work.c
lapacke_dsbgv.c
@ -877,6 +985,8 @@ set(DSRC
lapacke_dstevx_work.c
lapacke_dsycon.c
lapacke_dsycon_work.c
lapacke_dsycon_3.c
lapacke_dsycon_3_work.c
lapacke_dsyconv.c
lapacke_dsyconv_work.c
lapacke_dsyequb.c
@ -889,10 +999,20 @@ set(DSRC
lapacke_dsyevr_work.c
lapacke_dsyevx.c
lapacke_dsyevx_work.c
lapacke_dsyev_2stage.c
lapacke_dsyev_2stage_work.c
lapacke_dsyevd_2stage.c
lapacke_dsyevd_2stage_work.c
lapacke_dsyevr_2stage.c
lapacke_dsyevr_2stage_work.c
lapacke_dsyevx_2stage.c
lapacke_dsyevx_2stage_work.c
lapacke_dsygst.c
lapacke_dsygst_work.c
lapacke_dsygv.c
lapacke_dsygv_work.c
lapacke_dsygv_2stage.c
lapacke_dsygv_2stage_work.c
lapacke_dsygvd.c
lapacke_dsygvd_work.c
lapacke_dsygvx.c
@ -903,6 +1023,10 @@ set(DSRC
lapacke_dsysv_rook.c
lapacke_dsysv_rook_work.c
lapacke_dsysv_work.c
lapacke_dsysv_aa.c
lapacke_dsysv_aa_work.c
lapacke_dsysv_rk.c
lapacke_dsysv_rk_work.c
lapacke_dsysvx.c
lapacke_dsysvx_work.c
lapacke_dsyswapr.c
@ -911,16 +1035,30 @@ set(DSRC
lapacke_dsytrd_work.c
lapacke_dsytrf.c
lapacke_dsytrf_work.c
lapacke_dsytrf_rook.c
lapacke_dsytrf_rook_work.c
lapacke_dsytrf_aa.c
lapacke_dsytrf_aa_work.c
lapacke_dsytrf_rk.c
lapacke_dsytrf_rk_work.c
lapacke_dsytri.c
lapacke_dsytri2.c
lapacke_dsytri2_work.c
lapacke_dsytri_3.c
lapacke_dsytri_3_work.c
lapacke_dsytri2x.c
lapacke_dsytri2x_work.c
lapacke_dsytri_work.c
lapacke_dsytrs.c
lapacke_dsytrs_rook.c
lapacke_dsytrs2.c
lapacke_dsytrs2_work.c
lapacke_dsytrs_aa.c
lapacke_dsytrs_aa_work.c
lapacke_dsytrs_3.c
lapacke_dsytrs_3_work.c
lapacke_dsytrs_work.c
lapacke_dsytrs_rook_work.c
lapacke_dtbcon.c
lapacke_dtbcon_work.c
lapacke_dtbrfs.c
@ -998,6 +1136,8 @@ set(SSRC
lapacke_sbbcsd_work.c
lapacke_sbdsdc.c
lapacke_sbdsdc_work.c
lapacke_sbdsvdx.c
lapacke_sbdsvdx_work.c
lapacke_sbdsqr.c
lapacke_sbdsqr_work.c
lapacke_sdisna.c
@ -1056,14 +1196,14 @@ set(SSRC
lapacke_sgelss_work.c
lapacke_sgelsy.c
lapacke_sgelsy_work.c
lapacke_sgemqr.c
lapacke_sgemqr_work.c
lapacke_sgemqrt.c
lapacke_sgemqrt_work.c
lapacke_sgeqlf.c
lapacke_sgeqlf_work.c
lapacke_sgeqp3.c
lapacke_sgeqp3_work.c
lapacke_sgeqpf.c
lapacke_sgeqpf_work.c
lapacke_sgeqr2.c
lapacke_sgeqr2_work.c
lapacke_sgeqrf.c
@ -1086,6 +1226,8 @@ set(SSRC
lapacke_sgesv_work.c
lapacke_sgesvd.c
lapacke_sgesvd_work.c
lapacke_sgesvdx.c
lapacke_sgesvdx_work.c
lapacke_sgesvj.c
lapacke_sgesvj_work.c
lapacke_sgesvx.c
@ -1094,36 +1236,46 @@ set(SSRC
lapacke_sgetf2_work.c
lapacke_sgetrf.c
lapacke_sgetrf_work.c
lapacke_sgetrf2.c
lapacke_sgetrf2_work.c
lapacke_sgetri.c
lapacke_sgetri_work.c
lapacke_sgetrs.c
lapacke_sgetrs_work.c
lapacke_sgetsls.c
lapacke_sgetsls_work.c
lapacke_sggbak.c
lapacke_sggbak_work.c
lapacke_sggbal.c
lapacke_sggbal_work.c
lapacke_sgges.c
lapacke_sgges_work.c
lapacke_sgges3.c
lapacke_sgges3_work.c
lapacke_sggesx.c
lapacke_sggesx_work.c
lapacke_sggev.c
lapacke_sggev_work.c
lapacke_sggev3.c
lapacke_sggev3_work.c
lapacke_sggevx.c
lapacke_sggevx_work.c
lapacke_sggglm.c
lapacke_sggglm_work.c
lapacke_sgghrd.c
lapacke_sgghrd_work.c
lapacke_sgghd3.c
lapacke_sgghd3_work.c
lapacke_sgglse.c
lapacke_sgglse_work.c
lapacke_sggqrf.c
lapacke_sggqrf_work.c
lapacke_sggrqf.c
lapacke_sggrqf_work.c
lapacke_sggsvd.c
lapacke_sggsvd_work.c
lapacke_sggsvp.c
lapacke_sggsvp_work.c
lapacke_sggsvd3.c
lapacke_sggsvd3_work.c
lapacke_sggsvp3.c
lapacke_sggsvp3_work.c
lapacke_sgtcon.c
lapacke_sgtcon_work.c
lapacke_sgtrfs.c
@ -1158,6 +1310,8 @@ set(SSRC
lapacke_slantr_work.c
lapacke_slapmr.c
lapacke_slapmr_work.c
lapacke_slapmt.c
lapacke_slapmt_work.c
lapacke_slapy2.c
lapacke_slapy2_work.c
lapacke_slapy3.c
@ -1176,6 +1330,8 @@ set(SSRC
lapacke_slartgp_work.c
lapacke_slartgs.c
lapacke_slartgs_work.c
lapacke_slascl.c
lapacke_slascl_work.c
lapacke_slaset.c
lapacke_slaset_work.c
lapacke_slasrt.c
@ -1192,6 +1348,8 @@ set(SSRC
lapacke_sorbdb_work.c
lapacke_sorcsd.c
lapacke_sorcsd_work.c
lapacke_sorcsd2by1.c
lapacke_sorcsd2by1_work.c
lapacke_sorgbr.c
lapacke_sorgbr_work.c
lapacke_sorghr.c
@ -1258,6 +1416,8 @@ set(SSRC
lapacke_sposvx_work.c
lapacke_spotrf.c
lapacke_spotrf_work.c
lapacke_spotrf2.c
lapacke_spotrf2_work.c
lapacke_spotri.c
lapacke_spotri_work.c
lapacke_spotrs.c
@ -1300,6 +1460,12 @@ set(SSRC
lapacke_ssbevd_work.c
lapacke_ssbevx.c
lapacke_ssbevx_work.c
lapacke_ssbev_2stage.c
lapacke_ssbev_2stage_work.c
lapacke_ssbevd_2stage.c
lapacke_ssbevd_2stage_work.c
lapacke_ssbevx_2stage.c
lapacke_ssbevx_2stage_work.c
lapacke_ssbgst.c
lapacke_ssbgst_work.c
lapacke_ssbgv.c
@ -1366,6 +1532,8 @@ set(SSRC
lapacke_sstevx_work.c
lapacke_ssycon.c
lapacke_ssycon_work.c
lapacke_ssycon_3.c
lapacke_ssycon_3_work.c
lapacke_ssyconv.c
lapacke_ssyconv_work.c
lapacke_ssyequb.c
@ -1378,10 +1546,20 @@ set(SSRC
lapacke_ssyevr_work.c
lapacke_ssyevx.c
lapacke_ssyevx_work.c
lapacke_ssyev_2stage.c
lapacke_ssyev_2stage_work.c
lapacke_ssyevd_2stage.c
lapacke_ssyevd_2stage_work.c
lapacke_ssyevr_2stage.c
lapacke_ssyevr_2stage_work.c
lapacke_ssyevx_2stage.c
lapacke_ssyevx_2stage_work.c
lapacke_ssygst.c
lapacke_ssygst_work.c
lapacke_ssygv.c
lapacke_ssygv_work.c
lapacke_ssygv_2stage.c
lapacke_ssygv_2stage_work.c
lapacke_ssygvd.c
lapacke_ssygvd_work.c
lapacke_ssygvx.c
@ -1392,6 +1570,10 @@ set(SSRC
lapacke_ssysv_rook.c
lapacke_ssysv_rook_work.c
lapacke_ssysv_work.c
lapacke_ssysv_aa.c
lapacke_ssysv_aa_work.c
lapacke_ssysv_rk.c
lapacke_ssysv_rk_work.c
lapacke_ssysvx.c
lapacke_ssysvx_work.c
lapacke_ssyswapr.c
@ -1400,16 +1582,30 @@ set(SSRC
lapacke_ssytrd_work.c
lapacke_ssytrf.c
lapacke_ssytrf_work.c
lapacke_ssytrf_rook.c
lapacke_ssytrf_rook_work.c
lapacke_ssytrf_aa.c
lapacke_ssytrf_aa_work.c
lapacke_ssytrf_rk.c
lapacke_ssytrf_rk_work.c
lapacke_ssytri.c
lapacke_ssytri2.c
lapacke_ssytri2_work.c
lapacke_ssytri_3.c
lapacke_ssytri_3_work.c
lapacke_ssytri2x.c
lapacke_ssytri2x_work.c
lapacke_ssytri_work.c
lapacke_ssytrs.c
lapacke_ssytrs_rook.c
lapacke_ssytrs2.c
lapacke_ssytrs2_work.c
lapacke_ssytrs_aa.c
lapacke_ssytrs_aa_work.c
lapacke_ssytrs_3.c
lapacke_ssytrs_3_work.c
lapacke_ssytrs_work.c
lapacke_ssytrs_rook_work.c
lapacke_stbcon.c
lapacke_stbcon_work.c
lapacke_stbrfs.c
@ -1440,6 +1636,8 @@ set(SSRC
lapacke_stpcon_work.c
lapacke_stpmqrt.c
lapacke_stpmqrt_work.c
lapacke_stpqrt.c
lapacke_stpqrt_work.c
lapacke_stpqrt2.c
lapacke_stpqrt2_work.c
lapacke_stprfb.c
@ -1529,6 +1727,8 @@ set(ZSRC
lapacke_zgeevx_work.c
lapacke_zgehrd.c
lapacke_zgehrd_work.c
lapacke_zgejsv.c
lapacke_zgejsv_work.c
lapacke_zgelq2.c
lapacke_zgelq2_work.c
lapacke_zgelqf.c
@ -1541,14 +1741,14 @@ set(ZSRC
lapacke_zgelss_work.c
lapacke_zgelsy.c
lapacke_zgelsy_work.c
lapacke_zgemqr.c
lapacke_zgemqr_work.c
lapacke_zgemqrt.c
lapacke_zgemqrt_work.c
lapacke_zgeqlf.c
lapacke_zgeqlf_work.c
lapacke_zgeqp3.c
lapacke_zgeqp3_work.c
lapacke_zgeqpf.c
lapacke_zgeqpf_work.c
lapacke_zgeqr2.c
lapacke_zgeqr2_work.c
lapacke_zgeqrf.c
@ -1571,42 +1771,56 @@ set(ZSRC
lapacke_zgesv_work.c
lapacke_zgesvd.c
lapacke_zgesvd_work.c
lapacke_zgesvdx.c
lapacke_zgesvdx_work.c
lapacke_zgesvj.c
lapacke_zgesvj_work.c
lapacke_zgesvx.c
lapacke_zgesvx_work.c
lapacke_zgetf2.c
lapacke_zgetf2_work.c
lapacke_zgetrf.c
lapacke_zgetrf_work.c
lapacke_zgetrf2.c
lapacke_zgetrf2_work.c
lapacke_zgetri.c
lapacke_zgetri_work.c
lapacke_zgetrs.c
lapacke_zgetrs_work.c
lapacke_zgetsls.c
lapacke_zgetsls_work.c
lapacke_zggbak.c
lapacke_zggbak_work.c
lapacke_zggbal.c
lapacke_zggbal_work.c
lapacke_zgges.c
lapacke_zgges_work.c
lapacke_zgges3.c
lapacke_zgges3_work.c
lapacke_zggesx.c
lapacke_zggesx_work.c
lapacke_zggev.c
lapacke_zggev_work.c
lapacke_zggev3.c
lapacke_zggev3_work.c
lapacke_zggevx.c
lapacke_zggevx_work.c
lapacke_zggglm.c
lapacke_zggglm_work.c
lapacke_zgghrd.c
lapacke_zgghrd_work.c
lapacke_zgghd3.c
lapacke_zgghd3_work.c
lapacke_zgglse.c
lapacke_zgglse_work.c
lapacke_zggqrf.c
lapacke_zggqrf_work.c
lapacke_zggrqf.c
lapacke_zggrqf_work.c
lapacke_zggsvd.c
lapacke_zggsvd_work.c
lapacke_zggsvp.c
lapacke_zggsvp_work.c
lapacke_zggsvd3.c
lapacke_zggsvd3_work.c
lapacke_zggsvp3.c
lapacke_zggsvp3_work.c
lapacke_zgtcon.c
lapacke_zgtcon_work.c
lapacke_zgtrfs.c
@ -1637,6 +1851,8 @@ set(ZSRC
lapacke_zhbtrd_work.c
lapacke_zhecon.c
lapacke_zhecon_work.c
lapacke_zhecon_3.c
lapacke_zhecon_3_work.c
lapacke_zheequb.c
lapacke_zheequb_work.c
lapacke_zheev.c
@ -1647,10 +1863,20 @@ set(ZSRC
lapacke_zheevr_work.c
lapacke_zheevx.c
lapacke_zheevx_work.c
lapacke_zheev_2stage.c
lapacke_zheev_2stage_work.c
lapacke_zheevd_2stage.c
lapacke_zheevd_2stage_work.c
lapacke_zheevr_2stage.c
lapacke_zheevr_2stage_work.c
lapacke_zheevx_2stage.c
lapacke_zheevx_2stage_work.c
lapacke_zhegst.c
lapacke_zhegst_work.c
lapacke_zhegv.c
lapacke_zhegv_work.c
lapacke_zhegv_2stage.c
lapacke_zhegv_2stage_work.c
lapacke_zhegvd.c
lapacke_zhegvd_work.c
lapacke_zhegvx.c
@ -1659,6 +1885,10 @@ set(ZSRC
lapacke_zherfs_work.c
lapacke_zhesv.c
lapacke_zhesv_work.c
lapacke_zhesv_aa.c
lapacke_zhesv_aa_work.c
lapacke_zhesv_rk.c
lapacke_zhesv_rk_work.c
lapacke_zhesvx.c
lapacke_zhesvx_work.c
lapacke_zheswapr.c
@ -1666,17 +1896,31 @@ set(ZSRC
lapacke_zhetrd.c
lapacke_zhetrd_work.c
lapacke_zhetrf.c
lapacke_zhetrf_rook.c
lapacke_zhetrf_work.c
lapacke_zhetrf_rook_work.c
lapacke_zhetrf_aa.c
lapacke_zhetrf_aa_work.c
lapacke_zhetrf_rk.c
lapacke_zhetrf_rk_work.c
lapacke_zhetri.c
lapacke_zhetri2.c
lapacke_zhetri2_work.c
lapacke_zhetri_3.c
lapacke_zhetri_3_work.c
lapacke_zhetri2x.c
lapacke_zhetri2x_work.c
lapacke_zhetri_work.c
lapacke_zhetrs.c
lapacke_zhetrs_rook.c
lapacke_zhetrs2.c
lapacke_zhetrs2_work.c
lapacke_zhetrs_work.c
lapacke_zhetrs_aa.c
lapacke_zhetrs_aa_work.c
lapacke_zhetrs_3.c
lapacke_zhetrs_3_work.c
lapacke_zhetrs_rook_work.c
lapacke_zhfrk.c
lapacke_zhfrk_work.c
lapacke_zhgeqz.c
@ -1735,6 +1979,8 @@ set(ZSRC
lapacke_zlantr_work.c
lapacke_zlapmr.c
lapacke_zlapmr_work.c
lapacke_zlapmt.c
lapacke_zlapmt_work.c
lapacke_zlarfb.c
lapacke_zlarfb_work.c
lapacke_zlarfg.c
@ -1745,6 +1991,8 @@ set(ZSRC
lapacke_zlarfx_work.c
lapacke_zlarnv.c
lapacke_zlarnv_work.c
lapacke_zlascl.c
lapacke_zlascl_work.c
lapacke_zlaset.c
lapacke_zlaset_work.c
lapacke_zlaswp.c
@ -1787,6 +2035,8 @@ set(ZSRC
lapacke_zposvx_work.c
lapacke_zpotrf.c
lapacke_zpotrf_work.c
lapacke_zpotrf2.c
lapacke_zpotrf2_work.c
lapacke_zpotri.c
lapacke_zpotri_work.c
lapacke_zpotrs.c
@ -1849,6 +2099,8 @@ set(ZSRC
lapacke_zsteqr_work.c
lapacke_zsycon.c
lapacke_zsycon_work.c
lapacke_zsycon_3.c
lapacke_zsycon_3_work.c
lapacke_zsyconv.c
lapacke_zsyconv_work.c
lapacke_zsyequb.c
@ -1859,22 +2111,40 @@ set(ZSRC
lapacke_zsysv_rook.c
lapacke_zsysv_rook_work.c
lapacke_zsysv_work.c
lapacke_zsysv_aa.c
lapacke_zsysv_aa_work.c
lapacke_zsysv_rk.c
lapacke_zsysv_rk_work.c
lapacke_zsysvx.c
lapacke_zsysvx_work.c
lapacke_zsyswapr.c
lapacke_zsyswapr_work.c
lapacke_zsytrf.c
lapacke_zsytrf_work.c
lapacke_zsytrf_rook.c
lapacke_zsytrf_rook_work.c
lapacke_zsytrf_aa.c
lapacke_zsytrf_aa_work.c
lapacke_zsytrf_rk.c
lapacke_zsytrf_rk_work.c
lapacke_zsytri.c
lapacke_zsytri2.c
lapacke_zsytri2_work.c
lapacke_zsytri_3.c
lapacke_zsytri_3_work.c
lapacke_zsytri2x.c
lapacke_zsytri2x_work.c
lapacke_zsytri_work.c
lapacke_zsytrs.c
lapacke_zsytrs_rook.c
lapacke_zsytrs2.c
lapacke_zsytrs2_work.c
lapacke_zsytrs_work.c
lapacke_zsytrs_rook_work.c
lapacke_zsytrs_aa.c
lapacke_zsytrs_aa_work.c
lapacke_zsytrs_3.c
lapacke_zsytrs_3_work.c
lapacke_ztbcon.c
lapacke_ztbcon_work.c
lapacke_ztbrfs.c
@ -1949,6 +2219,8 @@ set(ZSRC
lapacke_zunbdb_work.c
lapacke_zuncsd.c
lapacke_zuncsd_work.c
lapacke_zuncsd2by1.c
lapacke_zuncsd2by1_work.c
lapacke_zungbr.c
lapacke_zungbr_work.c
lapacke_zunghr.c
@ -2119,6 +2391,6 @@ foreach (Utils_FILE ${Utils_SRC})
endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h")
include_directories(${lapacke_include_dir})
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")

9
cmake/openblas.pc.in Normal file
View File

@ -0,0 +1,9 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas
Cflags: -I${includedir}

View File

@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()

View File

@ -4,7 +4,8 @@
## This is triggered by system.cmake and runs before any of the code is built.
## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files).
## Next it runs f_check and appends some fortran information to the files.
## Finally it runs getarch and getarch_2nd for even more environment information.
## Then it runs getarch and getarch_2nd for even more environment information.
## Finally it builds gen_config_h for use at build time to generate config.h.
# CMake vars set by this file:
# CORE
@ -71,16 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
# disable WindowsStore strict CRT checks
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif ()
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -96,12 +107,18 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
endif ()
endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT)
@ -111,3 +128,21 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
ParseGetArchVars(${GETARCH2_MAKE_OUT})
# compile get_config_h
set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build")
set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
endif ()

View File

@ -22,7 +22,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER")
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
endif ()
@ -312,6 +312,8 @@ endif ()
set(AWK awk)
set(SED sed)
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

View File

@ -420,7 +420,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_arm64.h"
#endif
#ifdef ARCH_ZARCH
#include "common_zarch.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@ -428,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
@ -552,8 +561,13 @@ static void __inline blas_lock(volatile BLASULONG *address){
#endif
#if defined(C_PGI) || defined(C_SUN)
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#if defined(__STDC_IEC_559_COMPLEX__)
#define CREAL(X) creal(X)
#define CIMAG(X) cimag(X)
#else
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#endif
#else
#ifdef OPENBLAS_COMPLEX_STRUCT
#define CREAL(Z) ((Z).real)
@ -645,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
#ifdef OS_WINDOWSSTORE
static __inline int readenv_atoi(char *env) {
return 0;
}
#else
#ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) {
env_var_t p;
@ -660,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0);
}
#endif
#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)

View File

@ -105,7 +105,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:
#define EPILOGUE

View File

@ -39,7 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INLINE inline
#ifdef F_INTERFACE_FLANG
#define RETURN_BY_STACK
#else
#define RETURN_BY_COMPLEX
#endif
#ifndef ASSEMBLER

View File

@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...);
static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode,
unsigned flags) {
#if defined (__LSB_VERSION__)
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
#if defined (__LSB_VERSION__)
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;

View File

@ -2193,7 +2193,7 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sgemm_p;

View File

@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef COMMON_MIPS
#define COMMON_MIPS
#define MB
#define WMB
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define INLINE inline
@ -42,11 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
static void INLINE blas_lock(volatile unsigned long *address){
}
#define BLAS_LOCK_DEFINED
static inline unsigned int rpcc(void){
unsigned long ret;
@ -80,7 +75,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:
#define EPILOGUE

View File

@ -71,35 +71,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef COMMON_MIPS64
#define COMMON_MIPS64
#define MB
#define WMB
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define INLINE inline
#ifndef ASSEMBLER
static void INLINE blas_lock(volatile unsigned long *address){
long int ret, val = 1;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"1: ll %0, %3\n"
" ori %2, %0, 1\n"
" sc %2, %1\n"
" beqz %2, 1b\n"
" andi %2, %0, 1\n"
" sync\n"
: "=&r" (val), "=m" (address), "=&r" (ret)
: "m" (address)
: "memory");
} while (ret);
}
#define BLAS_LOCK_DEFINED
static inline unsigned int rpcc(void){
unsigned long ret;

View File

@ -245,6 +245,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define RETURN_BY_STACK
#endif
#ifdef F_INTERFACE_FLANG
#define RETURN_BY_STACK
#endif
#ifdef F_INTERFACE_PGI
#define RETURN_BY_STACK
#endif

140
common_zarch.h Normal file
View File

@ -0,0 +1,140 @@
/*****************************************************************************
Copyright (c) 2011-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_ZARCH
#define COMMON_ZARCH
#define MB
//__asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB
//__asm__ __volatile__ ("dmb ishst" : : : "memory")
#define INLINE inline
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
/*
static void __inline blas_lock(volatile BLASULONG *address){
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"mov x4, #1 \n\t"
"1: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "x2" , "x3", "x4"
);
} while (ret);
}
*/
//#define BLAS_LOCK_DEFINED
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
#endif
#define GET_IMAGE_CANCEL
#endif
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.text ;\
.align 256 ;\
.global REALNAME ;\
.type REALNAME, %function ;\
REALNAME:
#define EPILOGUE
#define PROFCODE
#endif
#define SEEK_ADDRESS
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@ -114,6 +114,7 @@
#define CORE_HASWELL 24
#define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26
#define CORE_ZEN 27
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@ -209,5 +210,6 @@ typedef struct {
#define CPUTYPE_HASWELL 48
#define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51
#endif

View File

@ -74,7 +74,7 @@ int get_feature(char *search)
fclose(infile);
if( p == NULL ) return;
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))

View File

@ -30,17 +30,26 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
#define CPU_CORTEXA57 2
#define CPU_VULCAN 3
#define CPU_THUNDERX 4
#define CPU_THUNDERX2T99 5
static char *cpuname[] = {
"UNKNOWN",
"ARMV8" ,
"CORTEXA57"
"CORTEXA57",
"VULCAN",
"THUNDERX",
"THUNDERX2T99"
};
static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"cortexa57"
"cortexa57",
"vulcan",
"thunderx",
"thunderx2t99"
};
int get_feature(char *search)
@ -85,25 +94,34 @@ int detect(void)
#ifdef linux
FILE *infile;
char buffer[512], *p;
p = (char *) NULL ;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) {
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
break;
}
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
cpu_part = strchr(buffer, ':') + 2;
cpu_part = strdup(cpu_part);
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
cpu_implementer = strchr(buffer, ':') + 2;
cpu_implementer = strdup(cpu_implementer);
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "0xd07")) {
return CPU_CORTEXA57;
}
if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41"))
return CPU_CORTEXA57;
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
return CPU_THUNDERX2T99;
}
p = (char *) NULL ;
@ -176,6 +194,28 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
@ -191,8 +231,42 @@ void get_cpuconfig(void)
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 16777216\n");
printf("#define L2_LINESIZE 128\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX2T99:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}

View File

@ -636,6 +636,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
LD1.associative = 8;
LD1.linesize = 64;
break;
case 0x63 :
DTB.size = 2048;
DTB.associative = 4;
DTB.linesize = 32;
LDTB.size = 4096;
LDTB.associative= 4;
LDTB.linesize = 32;
case 0x66 :
LD1.size = 8;
LD1.associative = 4;
@ -667,6 +674,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
LC1.size = 64;
LC1.associative = 8;
break;
case 0x76 :
ITB.size = 2048;
ITB.associative = 0;
ITB.linesize = 8;
LITB.size = 4096;
LITB.associative= 0;
LITB.linesize = 8;
case 0x77 :
LC1.size = 16;
LC1.associative = 4;
@ -1110,6 +1124,9 @@ int get_cpuname(void){
break;
case 3:
switch (model) {
case 7:
// Bay Trail
return CPUTYPE_ATOM;
case 10:
case 14:
// Ivy Bridge
@ -1199,6 +1216,33 @@ int get_cpuname(void){
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
// Xeon Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
case 14: // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
@ -1235,8 +1279,11 @@ int get_cpuname(void){
return CPUTYPE_OPTERON;
case 1:
case 3:
case 7:
case 10:
return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCAT;
case 6:
switch (model) {
case 1:
@ -1251,7 +1298,13 @@ int get_cpuname(void){
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 5: // New EXCAVATOR CPUS
if(support_avx())
return CPUTYPE_EXCAVATOR;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 0:
case 8:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
@ -1273,8 +1326,19 @@ int get_cpuname(void){
break;
}
break;
case 5:
return CPUTYPE_BOBCAT;
case 8:
switch (model) {
case 1:
// AMD Ryzen
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
}
}
break;
}
@ -1401,6 +1465,7 @@ static char *cpuname[] = {
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
};
static char *lowercpuname[] = {
@ -1454,6 +1519,7 @@ static char *lowercpuname[] = {
"haswell",
"steamroller",
"excavator",
"zen",
};
static char *corename[] = {
@ -1484,6 +1550,7 @@ static char *corename[] = {
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
};
static char *corename_lower[] = {
@ -1514,6 +1581,7 @@ static char *corename_lower[] = {
"haswell",
"steamroller",
"excavator",
"zen",
};
@ -1710,8 +1778,33 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
}
case 7:
// Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 12:
// Apollo Lake
return CORE_NEHALEM;
}
break;
case 9:
case 8:
if (model == 14) { // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
}
break;
@ -1741,8 +1834,13 @@ int get_coretype(void){
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 5: // New EXCAVATOR
if(support_avx())
return CORE_EXCAVATOR;
else
return CORE_BARCELONA; //OS don't support AVX.
case 0:
case 8:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
@ -1764,9 +1862,22 @@ int get_coretype(void){
}
break;
}
}else return CORE_BARCELONA;
} else if (exfamily == 8) {
switch (model) {
case 1:
// AMD Ryzen
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
#else
return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CORE_BARCELONA;
}
} else {
return CORE_BARCELONA;
}
}
}

111
cpuid_zarch.c Normal file
View File

@ -0,0 +1,111 @@
/**************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13"
};
int detect(void)
{
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Type", buffer, 4)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
return CPU_GENERIC;
}
void get_libname(void)
{
int d = detect();
printf("%s", cpuname_lower[d]);
}
char *get_corename(void)
{
return cpuname[detect()];
}
void get_architecture(void)
{
printf("ZARCH");
}
void get_subarchitecture(void)
{
int d = detect();
printf("%s", cpuname[d]);
}
void get_subdirname(void)
{
printf("zarch");
}
void get_cpuconfig(void)
{
int d = detect();
switch (d){
case CPU_GENERIC:
printf("#define ZARCH_GENERIC\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z13:
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

View File

@ -105,6 +105,10 @@ ARCH_X86_64
ARCH_POWER
#endif
#if defined(__s390x__) || defined(__zarch__)
ARCH_ZARCH
#endif
#ifdef __mips64
ARCH_MIPS64
#endif

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];

View File

@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -316,7 +316,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_l > GEMM3M_Q) {
min_l = (min_l + 1) / 2;
#ifdef UNROLL_X
min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X;
#endif
}
}
@ -326,7 +326,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@ -365,7 +365,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@ -386,7 +386,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@ -429,7 +429,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@ -451,7 +451,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@ -494,7 +494,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();

View File

@ -297,9 +297,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_l = GEMM_Q;
} else {
if (min_l > GEMM_Q) {
min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
}
@ -311,7 +311,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
} else {
l1stride = 0;
}
@ -369,7 +369,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();

View File

@ -365,7 +365,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
}
for(ls = 0; ls < k; ls += min_l){
@ -384,7 +384,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@ -482,7 +482,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@ -618,7 +618,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@ -754,7 +754,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();

View File

@ -189,7 +189,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
@ -230,7 +230,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
@ -245,7 +245,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
if (m_start >= js) {
@ -284,7 +284,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
@ -322,7 +322,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;
@ -353,7 +353,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (m_start - js) * COMPSIZE;
@ -383,7 +383,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;

View File

@ -198,7 +198,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
@ -239,7 +239,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;
@ -303,7 +303,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();
@ -375,7 +375,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
if (is < js + min_j) {
@ -460,7 +460,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();

View File

@ -210,8 +210,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to);
#endif
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
@ -233,7 +232,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
}
@ -253,8 +252,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
STOP_RPCC(copy_A);
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
@ -353,8 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
while (current >= 0) {
#endif
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
@ -412,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();
@ -425,8 +422,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
do {
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
@ -602,9 +598,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
double di = (double)i;
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) );
if (num_cpu == 0) width = n - ((n - width) & ~mask);
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) );
if ((width > n - i) || (width < mask)) width = n - i;
@ -644,7 +640,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
double di = (double)i;
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;

View File

@ -310,7 +310,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
}
@ -331,7 +331,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
} else {
if (args -> nthreads == 1) l1stride = 0;
}
@ -443,7 +443,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();

View File

@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;

View File

@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
set(BLAS_SERVER blas_server_win32.c)
endif ()
if (NOT DEFINED BLAS_SERVER)

View File

@ -110,3 +110,74 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
return 0;
}
int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda,
void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads){
blas_queue_t queue[MAX_CPU_NUMBER];
blas_arg_t args [MAX_CPU_NUMBER];
BLASLONG i, width, astride, bstride;
int num_cpu, calc_type;
calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;
mode |= BLAS_LEGACY;
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
num_cpu = 0;
i = m;
while (i > 0){
/* Adjust Parameters */
width = blas_quickdivide(i + nthreads - num_cpu - 1,
nthreads - num_cpu);
i -= width;
if (i < 0) width = width + i;
astride = width * lda;
if (!(mode & BLAS_TRANSB_T)) {
bstride = width * ldb;
} else {
bstride = width;
}
astride <<= calc_type;
bstride <<= calc_type;
args[num_cpu].m = width;
args[num_cpu].n = n;
args[num_cpu].k = k;
args[num_cpu].a = (void *)a;
args[num_cpu].b = (void *)b;
args[num_cpu].c = (void *)((char *)c + num_cpu * sizeof(double)*2);
args[num_cpu].lda = lda;
args[num_cpu].ldb = ldb;
args[num_cpu].ldc = ldc;
args[num_cpu].alpha = alpha;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = function;
queue[num_cpu].args = &args[num_cpu];
queue[num_cpu].next = &queue[num_cpu + 1];
a = (void *)((BLASULONG)a + astride);
b = (void *)((BLASULONG)b + bstride);
num_cpu ++;
}
if (num_cpu) {
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
}
return 0;
}

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>
@ -276,6 +276,9 @@ static void* blas_thread_server(void *arg){
unsigned int last_tick;
void *buffer, *sa, *sb;
blas_queue_t *queue;
blas_queue_t *tscq;
#ifdef TIMING_DEBUG
unsigned long start, stop;
#endif
@ -309,8 +312,11 @@ static void* blas_thread_server(void *arg){
last_tick = (unsigned int)rpcc();
while (!thread_status[cpu].queue) {
pthread_mutex_lock (&thread_status[cpu].lock);
tscq=thread_status[cpu].queue;
pthread_mutex_unlock (&thread_status[cpu].lock);
while(!tscq) {
YIELDING;
if ((unsigned int)rpcc() - last_tick > thread_timeout) {
@ -333,6 +339,9 @@ static void* blas_thread_server(void *arg){
last_tick = (unsigned int)rpcc();
}
pthread_mutex_lock (&thread_status[cpu].lock);
tscq=thread_status[cpu].queue;
pthread_mutex_unlock (&thread_status[cpu].lock);
}
@ -351,7 +360,9 @@ static void* blas_thread_server(void *arg){
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
pthread_mutex_lock (&thread_status[cpu].lock);
thread_status[cpu].queue = (blas_queue_t *)1;
pthread_mutex_unlock (&thread_status[cpu].lock);
sa = queue -> sa;
sb = queue -> sb;
@ -433,7 +444,10 @@ static void* blas_thread_server(void *arg){
// thread is marked as done and other threads use them
WMB;
pthread_mutex_lock (&thread_status[cpu].lock);
thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
pthread_mutex_unlock (&thread_status[cpu].lock);
WMB;
}
@ -613,6 +627,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#endif
BLASLONG i = 0;
blas_queue_t *current = queue;
blas_queue_t *tsiq,*tspq;
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
int node = get_node();
int nodes = get_num_nodes();
@ -660,15 +675,23 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
}
}
#else
while(thread_status[i].queue) {
pthread_mutex_lock (&thread_status[i].lock);
tsiq=thread_status[i].queue ;
pthread_mutex_unlock (&thread_status[i].lock);
while(tsiq) {
i ++;
if (i >= blas_num_threads - 1) i = 0;
pthread_mutex_lock (&thread_status[i].lock);
tsiq=thread_status[i].queue ;
pthread_mutex_unlock (&thread_status[i].lock);
}
#endif
queue -> assigned = i;
WMB;
pthread_mutex_lock (&thread_status[i].lock);
thread_status[i].queue = queue;
pthread_mutex_unlock (&thread_status[i].lock);
WMB;
queue = queue -> next;
@ -689,11 +712,15 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
pos = current -> assigned;
if ((BLASULONG)thread_status[pos].queue > 1) {
pthread_mutex_lock (&thread_status[pos].lock);
tspq=thread_status[pos].queue;
pthread_mutex_unlock (&thread_status[pos].lock);
if ((BLASULONG)tspq > 1) {
pthread_mutex_lock (&thread_status[pos].lock);
if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
pthread_mutex_lock (&thread_status[pos].lock);
#ifdef MONITOR
num_suspend ++;
@ -703,8 +730,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
thread_status[pos].status = THREAD_STATUS_WAKEUP;
pthread_cond_signal(&thread_status[pos].wakeup);
}
pthread_mutex_unlock(&thread_status[pos].lock);
}
pthread_mutex_unlock(&thread_status[pos].lock);
}
current = current -> next;
@ -714,11 +742,22 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
}
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
blas_queue_t * tsqq;
while ((num > 0) && queue) {
while(thread_status[queue -> assigned].queue) {
pthread_mutex_lock(&thread_status[queue->assigned].lock);
tsqq=thread_status[queue -> assigned].queue;
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
while(tsqq) {
YIELDING;
pthread_mutex_lock(&thread_status[queue->assigned].lock);
tsqq=thread_status[queue -> assigned].queue;
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
};
queue = queue -> next;

View File

@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
TerminateThread(blas_threads[i],0);
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
}
blas_server_avail = 0;

View File

@ -70,8 +70,10 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#endif
#else
//Use NEHALEM kernels for sandy bridge
@ -81,6 +83,7 @@ extern gotoblas_t gotoblas_HASWELL;
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
@ -232,6 +235,7 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 7) return &gotoblas_ATOM; //Bay Trail
return NULL;
case 4:
//Intel Haswell
@ -263,7 +267,6 @@ static gotoblas_t *get_coretype(void){
}
//Intel Braswell / Avoton
if (model == 12 || model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
return NULL;
@ -286,6 +289,30 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
return &gotoblas_NEHALEM;
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
@ -331,7 +358,14 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0){
}else if(model == 5){
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0 || model == 8){
if (exmodel == 1) {
//AMD Trinity
if(support_avx())
@ -358,9 +392,16 @@ static gotoblas_t *get_coretype(void){
}
}
} else {
} else if (exfamily == 8) {
if (model == 1) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}else {
return &gotoblas_BARCELONA;
}
}
@ -370,7 +411,6 @@ static gotoblas_t *get_coretype(void){
switch (family) {
case 0x6:
return &gotoblas_NANO;
break;
}
}
@ -401,6 +441,7 @@ static char *corename[] = {
"Haswell",
"Steamroller",
"Excavator",
"Zen"
};
char *gotoblas_corename(void) {
@ -427,6 +468,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
return corename[0];
}
@ -439,7 +481,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 22; i++)
for ( i=1 ; i <= 23; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
@ -457,6 +499,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);

View File

@ -354,6 +354,24 @@ static int numa_check(void) {
return common -> num_nodes;
}
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 6)
int sched_getcpu(void)
{
int cpu;
FILE *fp = NULL;
if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
return -1;
if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) {
fclose (fp);
return -1;
}
fclose (fp);
return(cpu);
}
#endif
#endif
static void numa_mapping(void) {
int node, cpu, core;
@ -808,16 +826,54 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
cpu_set_t *cpusetp;
int nums;
int ret;
#ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
//returns the number of processors which are currently online
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;
nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 3)
common->num_procs = nums;
#elif __GLIBC_PREREQ(2, 7)
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
common->num_procs = nums;
} else {
size_t size;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0)
common->num_procs = nums;
else
common->num_procs = CPU_COUNT_S(size,cpusetp);
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) {
common->num_procs = nums;
} else {
#if !__GLIBC_PREREQ(2, 6)
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
#endif
#endif
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
exit(1);
}
@ -923,7 +979,7 @@ void gotoblas_set_affinity2(int threads) {};
void gotoblas_affinity_reschedule(void) {};
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_ONLN); }
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); }
int get_num_nodes(void) { return 1; }

View File

@ -169,13 +169,50 @@ void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS)
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
#endif
#if !__GLIBC_PREREQ(2, 3)
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
#endif
return nums;
#endif
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp);
return nums;
}
#endif
@ -184,7 +221,7 @@ int get_num_procs(void) {
#ifdef OS_ANDROID
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
@ -381,6 +418,16 @@ static int release_pos = 0;
static int hot_alloc = 0;
#endif
/* Global lock for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
#ifdef ALLOC_MMAP
static void alloc_mmap_free(struct release_t *release){
@ -390,6 +437,8 @@ static void alloc_mmap_free(struct release_t *release){
}
}
#ifdef NO_WARMUP
static void *alloc_mmap(void *address){
@ -406,9 +455,11 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
LOCK_COMMAND(&alloc_lock);
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
UNLOCK_COMMAND(&alloc_lock);
}
#ifdef OS_LINUX
@ -550,12 +601,14 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@ -889,15 +942,6 @@ static void *alloc_hugetlbfile(void *address){
}
#endif
/* Global lock for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
#ifdef SEEK_ADDRESS
static BLASULONG base_address = 0UL;
@ -963,45 +1007,41 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
gotoblas_dynamic_init();
#endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
gotoblas_affinity_init();
#endif
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
blas_set_parameter();
#endif
#endif
memory_initialized = 1;
}
memory_initialized = 1;
UNLOCK_COMMAND(&alloc_lock);
}
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf("Alloc Start ...\n");
@ -1012,7 +1052,7 @@ void *blas_memory_alloc(int procpos){
mypos = WhereAmI();
position = mypos;
while (position > NUM_BUFFERS) position >>= 1;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
@ -1034,14 +1074,14 @@ void *blas_memory_alloc(int procpos){
position = 0;
do {
if (!memory[position].used) {
/* if (!memory[position].used) { */
blas_lock(&memory[position].lock);
if (!memory[position].used) goto allocation;
blas_unlock(&memory[position].lock);
}
/* } */
position ++;
@ -1103,7 +1143,9 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
LOCK_COMMAND(&alloc_lock);
memory[position].addr = map_address;
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@ -1157,9 +1199,10 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
LOCK_COMMAND(&alloc_lock);
while ((memory[position].addr != free_area)
&& (position < NUM_BUFFERS)) position++;
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
@ -1171,6 +1214,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -1185,6 +1229,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
UNLOCK_COMMAND(&alloc_lock);
return;
}
@ -1471,12 +1516,30 @@ static int on_process_term(void)
#else
#pragma comment(linker, "/INCLUDE:__tls_used")
#endif
#pragma data_seg(push, old_seg)
#ifdef _WIN64
#pragma const_seg(".CRT$XLB")
#else
#pragma data_seg(".CRT$XLB")
#endif
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#ifdef _WIN64
#pragma const_seg(".CRT$XTU")
#else
#pragma data_seg(".CRT$XTU")
#endif
static int(*p_process_term)(void) = on_process_term;
#pragma data_seg(pop, old_seg)
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#endif
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
int size = 16;
#else
int size = get_L2_size();
@ -497,13 +497,13 @@ void blas_set_parameter(void){
if (xgemm_p == 0) xgemm_p = 64;
#endif
sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1);
dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1);
cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1);
zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1);
sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
#ifdef QUAD_PRECISION
qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1);
xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1);
qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
@ -727,3 +727,38 @@ void blas_set_parameter(void){
}
#endif
#if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
}
#endif

View File

@ -46,10 +46,16 @@
#define printf _cprintf
#endif
#ifdef INTERFACE64
#define MSGFMT " ** On entry to %6s parameter number %2ld had an illegal value\n"
#else
#define MSGFMT " ** On entry to %6s parameter number %2d had an illegal value\n"
#endif
#ifdef __ELF__
int __xerbla(char *message, blasint *info, blasint length){
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
printf(MSGFMT,
message, *info);
return 0;
@ -61,7 +67,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_
int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
printf(MSGFMT,
message, *info);
return 0;

View File

@ -118,10 +118,16 @@ endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
else
@ -132,13 +138,13 @@ endif
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
#for LSB
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
rm -f linktest

61
exports/check_objs.sh Executable file
View File

@ -0,0 +1,61 @@
#!/bin/bash
while read OBJ; do
if echo "$OBJ"|grep "_$" >/dev/null
then
[ "$OBJ" = "caxpyc_" ] && continue
[ "$OBJ" = "zaxpyc_" ] && continue
[ "$OBJ" = "blas_thread_shutdown_" ] && continue
O1=$(echo "$OBJ"|sed -e 's/_$//' )
if grep -w "$O1" exports/gensymbol >/dev/null
then
true
else
echo "$O1"
fi
continue
fi
if echo "$OBJ"|grep "^cblas" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
continue
fi
if echo "$OBJ"|grep "^LAPACKE" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
continue
fi
if echo "$OBJ"|grep "^lapack" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
fi
done

File diff suppressed because it is too large Load Diff

26
f_check
View File

@ -33,6 +33,7 @@ if ($compiler eq "") {
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77",
"flang",
"ifort");
OUTER:
@ -78,8 +79,13 @@ if ($compiler eq "") {
$vendor = GFORTRAN;
$openmp = "-fopenmp";
} else {
$vendor = G77;
$openmp = "";
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} else {
$vendor = G77;
$openmp = "";
}
}
}
@ -197,6 +203,12 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($compiler =~ /flang/) {
$vendor = FLANG;
$bu = "_";
$openmp = "-fopenmp";
}
if ($vendor eq "") {
$nofortran = 1;
$compiler = "gfortran";
@ -283,6 +295,12 @@ if ($link ne "") {
$linker_L .= "-Wl,". $flags . " ";
}
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
@ -325,6 +343,10 @@ if ($vendor eq "INTEL"){
$linker_a .= "-lgfortran"
}
if ($vendor eq "FLANG"){
$linker_a .= "-lflang"
}
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
open(CONFFILE, ">> $config" ) || die "Can't append $config";

36
gen_config_h.c Normal file
View File

@ -0,0 +1,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char**argv) {
FILE *fp;
char line[100];
char line2[80];
char *s;
int i;
fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n");
fprintf(stdout,"#define OPENBLAS_CONFIG_H\n");
fp=fopen(argv[1],"r");
do{
s=fgets(line,80,fp);
if (s== NULL) break;
memset(line2,0,80);
i=sscanf(line,"#define %70c",line2);
if (i!=0) {
fprintf(stdout,"#define OPENBLAS_%s",line2);
} else {
fprintf(stdout,"\n");
}
} while (1);
fclose(fp);
fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION);
fp=fopen(argv[2],"r");
do{
s=fgets(line,100,fp);
if (s== NULL) break;
fprintf(stdout,"%s",line);
} while(1);
fclose(fp);
fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n");
exit(0);
}

View File

@ -473,6 +473,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "EXCAVATOR"
#endif
#if defined (FORCE_ZEN)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG "-DZEN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL2_CODE_ASSOCIATIVE=8 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \
"-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
#define LIBNAME "zen"
#define CORENAME "ZEN"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
@ -884,7 +903,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBARCHITECTURE "CORTEXA57"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA57 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
@ -897,6 +916,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_VULCAN
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VULCAN"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVULCAN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "vulcan"
#define CORENAME "VULCAN"
#else
#endif
#ifdef FORCE_THUNDERX
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTHUNDERX2T99 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
#ifndef FORCE
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@ -907,6 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
#if defined(__zarch__) || defined(__s390x__)
#define ZARCH
#include "cpuid_zarch.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef INTEL_AMD
#include "cpuid_x86.c"
#define OPENBLAS_SUPPORTED
@ -971,7 +1044,7 @@ static int get_num_cores(void) {
#if defined(linux) || defined(__sun__)
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
return sysconf(_SC_NPROCESSORS_CONF);
#elif defined(OS_WINDOWS)
@ -1006,7 +1079,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
printf("CORE=%s\n", get_corename());
#endif
#endif
@ -1098,6 +1171,7 @@ int main(int argc, char *argv[]){
p ++;
}
} else {
if (*p != '\n')
printf("%c", *p);
p ++;
}
@ -1113,7 +1187,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@ -84,10 +84,10 @@ CBLAS1OBJS = \
CBLAS2OBJS = \
cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) csymv.$(SUFFIX) \
csyr.$(SUFFIX) csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
csbmv.$(SUFFIX) cspmv.$(SUFFIX) \
cspr.$(SUFFIX) cspr2.$(SUFFIX) \
ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \
csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
csbmv.$(SUFFIX) \
cspr2.$(SUFFIX) \
ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \
ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \
chemv.$(SUFFIX) chbmv.$(SUFFIX) \
@ -113,10 +113,10 @@ ZBLAS1OBJS = \
ZBLAS2OBJS = \
zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) zsymv.$(SUFFIX) \
zsyr.$(SUFFIX) zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \
zspr.$(SUFFIX) zspr2.$(SUFFIX) \
ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \
zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
zsbmv.$(SUFFIX) \
zspr2.$(SUFFIX) \
ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \
ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \
zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \
@ -315,7 +315,7 @@ CCBLAS3OBJS = \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
cblas_cgeadd.$(SUFFIX)
cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX)
@ -2137,3 +2137,5 @@ cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)

View File

@ -42,9 +42,13 @@
#include "functable.h"
#endif
#if defined(THUNDERX2T99) || defined(VULCAN)
// Multithreaded swap gives performance benefits in ThunderX2T99
#else
// Disable multi-threading as it does not show any performance
// benefits. Keep the multi-threading code for the record.
#undef SMP
#endif
#ifndef CBLAS
@ -81,7 +85,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))

22
interface/xerbla.c Normal file
View File

@ -0,0 +1,22 @@
#ifdef CBLAS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include "common.h"
void CNAME(blasint p, char *rout, char *form, ...)
{
va_list args;
va_start(args, form);
if (p)
fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout);
vfprintf(stderr, form, args);
va_end(args);
exit(-1);
}
#endif

View File

@ -160,9 +160,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasin
if (n <= 0) {
#ifdef FORCE_USE_STACK
//*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
CREAL(*result) = 0.0;
CIMAG(*result) = 0.0;
OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
*result = zero;
// CREAL(*result) = 0.0;
// CIMAG(*result) = 0.0;
return;
#else
return zero;

View File

@ -125,9 +125,8 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#ifdef NEW_IMATCOPY
if (*lda == *ldb) {
if (*lda == *ldb && *cols == *rows) {
if ( order == BlasColMajor )
{
@ -180,7 +179,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed\n");
printf("Memory alloc failed in zimatcopy\n");
exit(1);
}
@ -205,14 +204,14 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasTrans )
{
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
}
if ( trans == BlasTransConj )
{
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
}
@ -238,20 +237,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasTrans )
{
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
}
if ( trans == BlasTransConj )
{
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
}
}
free(b);
return;
}

View File

@ -118,7 +118,7 @@ endforeach ()
# Makefile.L3
set(USE_TRMM false)
if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "HASWELL")
if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "{CORE}" STREQUAL "zen")
set(USE_TRMM true)
endif ()

View File

@ -32,10 +32,18 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
ifeq ($(CORE), ZEN)
USE_TRMM = 1
endif
ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
ifeq ($(CORE), Z13)
USE_TRMM = 1
endif

View File

@ -1,7 +1,5 @@
include $(KERNELDIR)/KERNEL.ARMV5
###############################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
endif
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
endif
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S

View File

@ -1,91 +1,12 @@
#################################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
include $(KERNELDIR)/KERNEL.ARMV6
SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

View File

@ -475,6 +475,14 @@ asum_kernel_L999:
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE

View File

@ -38,10 +38,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_ALPHA [fp, #0]
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define OLD_Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_ALPHAR [fp, #0]
#define OLD_ALPHAI [fp, #8]
#define OLD_X [fp, #16 ]
#define OLD_INC_X [fp, #20 ]
#define OLD_Y [fp, #24 ]
#define OLD_INC_Y [fp, #28 ]
#endif
#endif //!defined(COMPLEX)
#else //__ARM_PCS_VFP
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif //!defined(__ARM_PCS_VFP)
#define N r0
#define Y r1
@ -64,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -83,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -363,6 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
ldr X, OLD_X
#else
vldr d0, OLD_ALPHA
ldr X, OLD_X
#endif
#else //COMPLEX
#if !defined(DOUBLE)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr X, OLD_X
#else
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr X, OLD_X
#endif
#endif
#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y

View File

@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
#define OLD_INC_Y [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5
#define Y r6
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
add fp, sp, #24
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmov s2, s0
vmov s3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0
ble cdot_kernel_L999
@ -265,7 +283,6 @@ cdot_kernel_S10:
cdot_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
@ -276,8 +293,11 @@ cdot_kernel_L999:
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {s0 - s1}
#endif
sub sp, fp, #24
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr

View File

@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R2 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 vmls.f32
#endif
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble cgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble cgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N

View File

@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R2 fnmacs
#define FMAC_R1 vnmul.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuls
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I1 vnmul.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmuls
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fmacs
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R1 vnmul.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I2 fnmacs
#define FMAC_I1 vnmul.f32
#define FMAC_I2 vmls.f32
#endif
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -246,6 +246,9 @@ ddot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value
#if !defined(__ARM_PCS_VFP)
vmov r0, r1, d0
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

View File

@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

Some files were not shown because too many files have changed in this diff Show More