commit
5dde4e65d3
|
@ -14,6 +14,21 @@ lapack-3.4.2.tgz
|
|||
lapack-netlib/make.inc
|
||||
lapack-netlib/lapacke/include/lapacke_mangling.h
|
||||
lapack-netlib/TESTING/testing_results.txt
|
||||
lapack-netlib/INSTALL/test*
|
||||
lapack-netlib/TESTING/xeigtstc
|
||||
lapack-netlib/TESTING/xeigtstd
|
||||
lapack-netlib/TESTING/xeigtsts
|
||||
lapack-netlib/TESTING/xeigtstz
|
||||
lapack-netlib/TESTING/xlintstc
|
||||
lapack-netlib/TESTING/xlintstd
|
||||
lapack-netlib/TESTING/xlintstds
|
||||
lapack-netlib/TESTING/xlintstrfc
|
||||
lapack-netlib/TESTING/xlintstrfd
|
||||
lapack-netlib/TESTING/xlintstrfs
|
||||
lapack-netlib/TESTING/xlintstrfz
|
||||
lapack-netlib/TESTING/xlintsts
|
||||
lapack-netlib/TESTING/xlintstz
|
||||
lapack-netlib/TESTING/xlintstzc
|
||||
*.so
|
||||
*.so.*
|
||||
*.a
|
||||
|
@ -69,3 +84,6 @@ test/zblat3
|
|||
build
|
||||
build.*
|
||||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
|
||||
|
|
107
CMakeLists.txt
107
CMakeLists.txt
|
@ -2,16 +2,19 @@
|
|||
## Author: Hank Anderson <hank@statease.com>
|
||||
##
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.4)
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 19)
|
||||
set(OpenBLAS_PATCH_VERSION 20)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
enable_language(C)
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
include(GNUInstallDirs)
|
||||
|
||||
if(MSVC)
|
||||
set(OpenBLAS_LIBNAME libopenblas)
|
||||
else()
|
||||
|
@ -30,11 +33,21 @@ set(NO_LAPACK 1)
|
|||
set(NO_LAPACKE 1)
|
||||
endif()
|
||||
|
||||
if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
|
||||
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
|
||||
set(CMAKE_BUILD_TYPE
|
||||
Debug Debug
|
||||
Release Release
|
||||
)
|
||||
else()
|
||||
if( NOT CMAKE_BUILD_TYPE )
|
||||
if(BUILD_DEBUG)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
else()
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(BUILD_WITHOUT_CBLAS)
|
||||
set(NO_CBLAS 1)
|
||||
|
@ -109,7 +122,10 @@ endif ()
|
|||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
|
||||
if(MSVC)
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
|
||||
endif ()
|
||||
# get obj vars into the format that add_library expects: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
|
@ -129,9 +145,12 @@ if (NOT NO_LAPACKE)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
#Only generate .def for dll on MSVC
|
||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||
if(MSVC)
|
||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
|
||||
endif()
|
||||
|
||||
# add objects to the openblas lib
|
||||
|
@ -141,11 +160,14 @@ include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
|||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
|
@ -153,6 +175,7 @@ add_subdirectory(utest)
|
|||
|
||||
if (NOT MSVC)
|
||||
# The static library is only built for non-MSVC toolchains; MSVC builds just the shared library
|
||||
|
||||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
|
||||
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
|
||||
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
|
||||
|
@ -198,3 +221,73 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
|||
#endif
|
||||
# @touch lib.grd
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
|
||||
# Install include files
|
||||
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
|
||||
ADD_CUSTOM_COMMAND(
|
||||
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
|
||||
)
|
||||
|
||||
ADD_CUSTOM_TARGET(genconfig
|
||||
ALL
|
||||
DEPENDS openblas_config.h
|
||||
)
|
||||
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(genf77blas
|
||||
ALL
|
||||
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
)
|
||||
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(gencblas
|
||||
ALL
|
||||
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
|
||||
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
|
||||
)
|
||||
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT MSVC)
|
||||
install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
endif()
|
||||
|
||||
include(FindPkgConfig QUIET)
|
||||
if(PKG_CONFIG_FOUND)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
endif()
|
||||
|
|
|
@ -161,3 +161,10 @@ In chronological order:
|
|||
* Kaustubh Raste <https://github.com/ksraste/>
|
||||
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
|
||||
* Abdelrauf <https://github.com/quickwritereader>
|
||||
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
|
||||
* [2017-02-26] ztrmm kernel for IBM z13
|
||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,45 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
||||
common:
|
||||
* Improved CMake support
|
||||
* Fixed several thread race and locking bugs
|
||||
* Fixed default LAPACK optimization level
|
||||
* Updated LAPACK to 3.7.0
|
||||
* Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1)
|
||||
|
||||
POWER:
|
||||
* Optimizations for Power9
|
||||
* Fixed several Power8 assembly bugs
|
||||
|
||||
ARM:
|
||||
* New optimized Vulcan and ThunderX2T99 targets
|
||||
* Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1)
|
||||
* Detect all cpu cores including offline ones
|
||||
* Fix compilation with CLANG
|
||||
* Support building a shared library for Android
|
||||
|
||||
MIPS:
|
||||
* Fixed several threading issues
|
||||
* Fix compilation with CLANG
|
||||
|
||||
x86_64:
|
||||
* Detect Intel Bay Trail and Apollo Lake
|
||||
* Detect Intel Sky Lake and Kaby Lake
|
||||
* Detect Intel Knights Landing
|
||||
* Detect AMD A8, A10, A12 and Ryzen
|
||||
* Support 64bit builds with Visual Studio
|
||||
* Fix building with Intel and PGI compilers
|
||||
* Fix building with MINGW and TDM-GCC
|
||||
* Fix cmake builds for Haswell and related cpus
|
||||
* Fix building for Sandybridge with CLANG 3.9
|
||||
* Add support for the FLANG compiler
|
||||
|
||||
IBM Z:
|
||||
* New target z13 with BLAS3 optimizations
|
||||
|
||||
====================================================================
|
||||
Version 0.2.19
|
||||
1-Sep-2016
|
||||
|
|
28
Makefile
28
Makefile
|
@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
|
|||
SUBDIRS += lapack
|
||||
endif
|
||||
|
||||
RELA =
|
||||
ifeq ($(BUILD_RELAPACK), 1)
|
||||
RELA = re_lapack
|
||||
endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
|
||||
.PHONY : all libs netlib test ctest shared install
|
||||
.NOTPARALLEL : all libs prof lapack-test install blas-test
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
|
||||
all :: libs netlib tests shared
|
||||
all :: libs netlib $(RELA) tests shared
|
||||
@echo
|
||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||
@echo
|
||||
|
@ -81,7 +86,7 @@ endif
|
|||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
@ -215,6 +220,14 @@ ifndef NO_LAPACKE
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
re_lapack :
|
||||
|
||||
else
|
||||
re_lapack :
|
||||
@$(MAKE) -C relapack
|
||||
endif
|
||||
|
||||
prof_lapack : lapack_prebuild
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||
|
||||
|
@ -278,13 +291,13 @@ lapack-timing : large.tgz timing.tgz
|
|||
ifndef NOFORTRAN
|
||||
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
||||
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
||||
make -C $(NETLIB_LAPACK_DIR)/TIMING
|
||||
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
|
||||
endif
|
||||
|
||||
|
||||
lapack-test :
|
||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
|
@ -299,7 +312,7 @@ lapack-runtest:
|
|||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
||||
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
||||
|
||||
|
||||
|
@ -326,6 +339,7 @@ endif
|
|||
@touch $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
|
||||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
|
||||
@$(MAKE) -C relapack clean
|
||||
@rm -f *.grd Makefile.conf_last config_last.h
|
||||
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
|
||||
@echo Done.
|
||||
|
|
30
Makefile.arm
30
Makefile.arm
|
@ -1,31 +1,19 @@
|
|||
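# Logical OR for ifeq: $(filter ...) echoes CORE back when it matches one of the listed names (note that an empty CORE also compares equal)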
|
||||
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
else
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
else
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
|
||||
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -marm -march=armv5
|
||||
FCOMMON_OPT += -marm -march=armv5
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
endif
|
||||
|
|
|
@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
|||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VULCAN)
|
||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
endif
|
||||
|
|
|
@ -12,6 +12,7 @@ OPENBLAS_BUILD_DIR := $(CURDIR)
|
|||
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
|
||||
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
||||
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
|
||||
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
|
||||
|
||||
.PHONY : install
|
||||
.NOTPARALLEL : install
|
||||
|
@ -25,6 +26,7 @@ install : lib.grd
|
|||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
#for inc
|
||||
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
|
@ -50,7 +52,7 @@ ifndef NO_LAPACKE
|
|||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
|
@ -64,7 +66,7 @@ endif
|
|||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
@ -91,9 +93,20 @@ ifeq ($(OSNAME), WINNT)
|
|||
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
|
||||
@echo 'version='$(VERSION) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
|
||||
@echo 'extralib='$(EXTRALIB) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
|
||||
@cat openblas.pc.in >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
|
||||
|
||||
|
||||
#Generating OpenBLASConfig.cmake
|
||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
|
|
@ -43,7 +43,7 @@ endif
|
|||
|
||||
ifeq ($(USE_MASS), 1)
|
||||
# Path to MASS libs, change it if the libs are installed at any other location
|
||||
MASSPATH = /opt/ibm/xlmass/8.1.3/lib
|
||||
MASSPATH = /opt/ibm/xlmass/8.1.5/lib
|
||||
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
|
||||
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
|
||||
endif
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.19
|
||||
VERSION = 0.2.20
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -83,6 +83,9 @@ VERSION = 0.2.19
|
|||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
|
@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
NO_AFFINITY = 1
|
||||
#NO_AFFINITY = 1
|
||||
|
||||
# If you are compiling for Linux and have more than 16 NUMA nodes or more than 256 CPUs
|
||||
# BIGNUMA = 1
|
||||
|
|
|
@ -68,6 +68,9 @@ endif
|
|||
ifeq ($(TARGET), EXCAVATOR)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -98,6 +101,9 @@ endif
|
|||
ifeq ($(TARGET_CORE), EXCAVATOR)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -217,7 +223,9 @@ endif
|
|||
#
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.6
|
||||
endif
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
|
@ -234,6 +242,10 @@ EXTRALIB += -lm
|
|||
NO_EXPRECISION = 1
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
@ -406,7 +418,6 @@ CCOMMON_OPT += -fopenmp
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
$(error OpenBLAS: Clang didn't support OpenMP yet.)
|
||||
CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
|
@ -441,12 +452,13 @@ ifneq ($(NO_AVX), 1)
|
|||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
||||
endif
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += HASWELL
|
||||
DYNAMIC_CORE += HASWELL ZEN
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
DYNAMIC_ARCH =
|
||||
override DYNAMIC_ARCH=
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -474,6 +486,23 @@ endif
|
|||
ifeq ($(ARCH), arm)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
|
||||
CCOMMON_OPT += -marm
|
||||
FCOMMON_OPT += -marm
|
||||
|
||||
# If softfp abi is mentioned on the command line, force it.
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
CCOMMON_OPT += -mfloat-abi=softfp
|
||||
FCOMMON_OPT += -mfloat-abi=softfp
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
EXTRALIB += -lm
|
||||
else
|
||||
EXTRALIB += -Wl,-lm_hard
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
|
@ -575,6 +604,23 @@ endif
|
|||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
ifdef BINARY64
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
FCOMMON_OPT += -Wall
|
||||
else
|
||||
FCOMMON_OPT += -Wall
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), G77)
|
||||
CCOMMON_OPT += -DF_INTERFACE_G77
|
||||
FCOMMON_OPT += -Wall
|
||||
|
@ -1002,7 +1048,7 @@ endif
|
|||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
||||
override FFLAGS += $(FCOMMON_OPT)
|
||||
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
#MAKEOVERRIDES =
|
||||
|
||||
|
@ -1083,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
|
|||
ifneq ($(NO_LAPACKE), 1)
|
||||
LIB_COMPONENTS += LAPACKE
|
||||
endif
|
||||
ifeq ($(BUILD_RELAPACK), 1)
|
||||
LIB_COMPONENTS += ReLAPACK
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ONLY_CBLAS), 1)
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
ifeq ($(CORE), Z13)
|
||||
CCOMMON_OPT += -march=z13 -mzvector
|
||||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
22
README.md
22
README.md
|
@ -51,18 +51,18 @@ The library can be installed as below -
|
|||
|
||||
* On Ubuntu:
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.3
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -<br>
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list<br>
|
||||
sudo apt-get update<br>
|
||||
sudo apt-get install libxlmass-devel.8.1.5<br>
|
||||
|
||||
* On RHEL/CentOS:
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.3
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key<br>
|
||||
sudo rpm --import repomd.xml.key<br>
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo<br>
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/<br>
|
||||
sudo yum install libxlmass-devel.8.1.5<br>
|
||||
|
||||
After installing MASS library, compile openblas with USE_MASS=1.
|
||||
|
||||
|
@ -106,6 +106,10 @@ Please read GotoBLAS_01Readme.txt
|
|||
- **ARMV8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **Z13**: Optimized Level-3 BLAS
|
||||
|
||||
|
||||
### Support OS:
|
||||
- **GNU/Linux**
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
|
|
|
@ -34,6 +34,7 @@ BULLDOZER
|
|||
PILEDRIVER
|
||||
STEAMROLLER
|
||||
EXCAVATOR
|
||||
ZEN
|
||||
|
||||
c)VIA CPU:
|
||||
SSE_GENERIC
|
||||
|
@ -80,4 +81,7 @@ ARMV5
|
|||
8.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA57
|
||||
VULCAN
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
|
||||
|
|
|
@ -37,6 +37,18 @@ ESSL=/opt/ibm/lib
|
|||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
||||
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
|
||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
csymv.goto zsymv.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto
|
||||
else
|
||||
GOTO_LAPACK_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
|
@ -147,9 +159,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
|||
|
||||
else
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
||||
sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||
goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
|
||||
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
|
||||
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
|
||||
|
@ -162,20 +172,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
sscal.goto dscal.goto cscal.goto zscal.goto \
|
||||
sasum.goto dasum.goto casum.goto zasum.goto \
|
||||
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
|
||||
ssymv.goto dsymv.goto \
|
||||
chemv.goto zhemv.goto \
|
||||
chemm.goto zhemm.goto \
|
||||
cherk.goto zherk.goto \
|
||||
cher2k.goto zher2k.goto \
|
||||
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
|
||||
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
|
||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling \
|
||||
isamax.goto idamax.goto icamax.goto izamax.goto \
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
|
|
@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
|
|||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
@ -180,7 +180,9 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
|
|||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
@ -180,7 +180,10 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -2,15 +2,13 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
nfrom = 128
|
||||
nto = 2048
|
||||
nstep = 128
|
||||
loops = 1
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
loops <- 1
|
||||
|
||||
if (length(argv) > 0) {
|
||||
|
||||
for (z in 1:length(argv)) {
|
||||
|
||||
if (z == 1) {
|
||||
nfrom <- as.numeric(argv[z])
|
||||
} else if (z == 2) {
|
||||
|
@ -24,39 +22,34 @@ if ( length(argv) > 0 ) {
|
|||
|
||||
}
|
||||
|
||||
p=Sys.getenv("OPENBLAS_LOOPS")
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n = nfrom
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
|
||||
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
|
||||
|
||||
l = 1
|
||||
|
||||
start <- proc.time()[3]
|
||||
|
||||
while ( l <= loops ) {
|
||||
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
ev <- 0
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
ev <- eigen(A)
|
||||
l = l + 1
|
||||
}
|
||||
})
|
||||
|
||||
end <- proc.time()[3]
|
||||
timeg = end - start
|
||||
mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 )
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
|
||||
st = sprintf("%.0fx%.0f :",n , n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n = n + nstep
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -2,15 +2,13 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
nfrom = 128
|
||||
nto = 2048
|
||||
nstep = 128
|
||||
loops = 1
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
loops <- 1
|
||||
|
||||
if (length(argv) > 0) {
|
||||
|
||||
for (z in 1:length(argv)) {
|
||||
|
||||
if (z == 1) {
|
||||
nfrom <- as.numeric(argv[z])
|
||||
} else if (z == 2) {
|
||||
|
@ -24,40 +22,43 @@ if ( length(argv) > 0 ) {
|
|||
|
||||
}
|
||||
|
||||
p=Sys.getenv("OPENBLAS_LOOPS")
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n = nfrom
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
B <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
C <- 1
|
||||
|
||||
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
|
||||
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
|
||||
|
||||
l = 1
|
||||
|
||||
start <- proc.time()[3]
|
||||
|
||||
while ( l <= loops ) {
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
C <- A %*% B
|
||||
l = l + 1
|
||||
}
|
||||
l <- l + 1
|
||||
})
|
||||
|
||||
end <- proc.time()[3]
|
||||
timeg = end - start
|
||||
mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
|
||||
st = sprintf("%.0fx%.0f :",n , n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n = n + nstep
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -2,15 +2,13 @@
|
|||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
nfrom = 128
|
||||
nto = 2048
|
||||
nstep = 128
|
||||
loops = 1
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
loops <- 1
|
||||
|
||||
if (length(argv) > 0) {
|
||||
|
||||
for (z in 1:length(argv)) {
|
||||
|
||||
if (z == 1) {
|
||||
nfrom <- as.numeric(argv[z])
|
||||
} else if (z == 2) {
|
||||
|
@ -24,40 +22,36 @@ if ( length(argv) > 0 ) {
|
|||
|
||||
}
|
||||
|
||||
p=Sys.getenv("OPENBLAS_LOOPS")
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n = nfrom
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
|
||||
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
|
||||
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
|
||||
|
||||
l = 1
|
||||
|
||||
start <- proc.time()[3]
|
||||
|
||||
while ( l <= loops ) {
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
solve(A, B)
|
||||
l = l + 1
|
||||
}
|
||||
})
|
||||
|
||||
end <- proc.time()[3]
|
||||
timeg = end - start
|
||||
mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
|
||||
mflops <-
|
||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
|
||||
st = sprintf("%.0fx%.0f :",n , n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n = n + nstep
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
15
c_check
15
c_check
|
@ -10,6 +10,7 @@ $hostarch = "x86_64" if ($hostarch eq "amd64");
|
|||
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
@ -34,7 +35,7 @@ if (dirname($compiler_name) ne ".") {
|
|||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /(.*-)(.*)/) {
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
|
||||
|
@ -72,6 +73,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
|||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
@ -96,6 +98,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
|||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "zarch") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
|
@ -187,6 +194,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
|||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
@ -234,6 +242,11 @@ $linker_a = "";
|
|||
$linker_L .= "-Wl,". $flags . " "
|
||||
}
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
|
|
|
@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
|
|||
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
|
||||
endif ()
|
||||
if (NOT NO_AVX2)
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -73,6 +73,10 @@ if (${ARCH} STREQUAL "X86")
|
|||
set(ARCH x86)
|
||||
endif ()
|
||||
|
||||
if (${ARCH} MATCHES "ppc")
|
||||
set(ARCH power)
|
||||
endif ()
|
||||
|
||||
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
|
||||
if (${COMPILER_ID} STREQUAL "GNU")
|
||||
set(COMPILER_ID "GCC")
|
||||
|
@ -87,3 +91,8 @@ file(WRITE ${TARGET_CONF}
|
|||
"#define __${BINARY}BIT__\t1\n"
|
||||
"#define FUNDERSCORE\t${FU}\n")
|
||||
|
||||
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
|
||||
file(APPEND ${TARGET_CONF}
|
||||
"#define OS_WINNT\t1\n")
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -3,6 +3,21 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64)
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
endif ()
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "G77")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
set(ALLAUX
|
||||
ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f
|
||||
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f
|
||||
ilaprec.f ilatrans.f ilauplo.f iladiag.f iparam2stage.F chla_transtype.f
|
||||
../INSTALL/ilaver.f ../INSTALL/slamch.f
|
||||
)
|
||||
|
||||
|
@ -26,7 +26,7 @@ set(SCLAUX
|
|||
)
|
||||
|
||||
set(DZLAUX
|
||||
dbdsdc.f
|
||||
dbdsdc.f dbdsvdx.f
|
||||
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
|
||||
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
|
||||
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
|
||||
|
@ -42,20 +42,28 @@ set(DZLAUX
|
|||
dsteqr.f dsterf.f dlaisnan.f disnan.f
|
||||
dlartgp.f dlartgs.f
|
||||
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f
|
||||
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f
|
||||
dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f
|
||||
dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f
|
||||
dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f
|
||||
dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f
|
||||
dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f
|
||||
dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F
|
||||
dsytrd_sy2sb.f dlarfy.f
|
||||
)
|
||||
|
||||
set(SLASRC
|
||||
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
|
||||
sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
|
||||
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
|
||||
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
|
||||
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
|
||||
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
|
||||
sgetc2.f sgetri.f
|
||||
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
|
||||
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f
|
||||
sgetc2.f sgetri.f sgetrf2.f
|
||||
sggbak.f sggbal.f sgghd3.f sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f
|
||||
sggglm.f sgghrd.f sgglse.f sggqrf.f
|
||||
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sggrqf.f DEPRECATED/sggsvd.f sggsvd3.f DEPRECATED/sggsvp.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
|
||||
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
|
||||
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
|
||||
|
@ -72,7 +80,7 @@ set(SLASRC
|
|||
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
|
||||
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
|
||||
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
|
||||
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
|
||||
sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f
|
||||
sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
|
||||
sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
|
||||
spbstf.f spbsv.f spbsvx.f
|
||||
|
@ -96,7 +104,7 @@ set(SLASRC
|
|||
stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f
|
||||
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
|
||||
stptrs.f
|
||||
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
|
||||
strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
|
||||
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
|
||||
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
|
||||
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
|
||||
|
@ -106,9 +114,16 @@ set(SLASRC
|
|||
sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f
|
||||
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
|
||||
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f
|
||||
sgelq.f sgelqt.f sgelqt3.f sgemlq.f sgemlqt.f sgemqr.f sgeqr.f sgetsls.f
|
||||
slamswlq.f slamtsqr.f slaswlq.f slatsqr.f stplqt.f stplqt2.f stpmlqt.f
|
||||
ssysv_aa.f ssytrf_aa.f ssytrs_aa.f slasyf_aa.f ssytf2_rk.f slasyf_rk.f
|
||||
ssytrf_rk.f ssytrs_3.f ssycon_3.f ssytri_3.f ssytri_3x.f ssysv_rk.f
|
||||
ssb2st_kernels.f ssbev_2stage.f ssbevd_2stage.f ssbevx_2stage.f
|
||||
ssyev_2stage.f ssyevd_2stage.f ssyevr_2stage.f ssyevx_2stage.f
|
||||
ssygv_2stage.f ssytrd_2stage.f ssytrd_sb2st.F ssytrd_sy2sb.f slarfy.f
|
||||
)
|
||||
|
||||
set(DSLASRC spotrs.f)
|
||||
set(DSLASRC spotrs.f spotrf2.f)
|
||||
|
||||
set(CLASRC
|
||||
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
|
||||
|
@ -165,7 +180,7 @@ set(CLASRC
|
|||
ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f
|
||||
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
|
||||
ctprfs.f ctptri.f
|
||||
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
|
||||
ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
|
||||
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
|
||||
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
|
||||
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
|
||||
|
@ -178,6 +193,14 @@ set(CLASRC
|
|||
cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f
|
||||
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
|
||||
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f
|
||||
cgelq.f cgelqt.f cgelqt3.f cgemlq.f cgemlqt.f cgemqr.f cgeqr.f cgetsls.f
|
||||
clamswlq.f clamtsqr.f claswlq.f clatsqr.f ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chesv_aa.f chetrf_aa.f chetrs_aa.f clahef_aa.f csytf2_rk.f clasyf_rk.f
|
||||
csytrf_rk.f csytrs_3.f csycon_3.f csytri_3.f csytri_3x.f csysv_rk.f
|
||||
chetf2_rk.f clahef_rk.f chetrf_rk.f chetrs_3.f checon_3.f chetri_3.f
|
||||
chetri_3x.f chesv_rk.f chb2st_kernels.f chbev_2stage.f chbevd_2stage.f
|
||||
chbevx_2stage.f cheev_2stage.f cheevd_2stage.f cheevr_2stage.f cheevx_2stage.f
|
||||
chegv_2stage.f chetrd_2stage.f chetrd_hb2st.F chetrd_he2hb.f clarfy.f
|
||||
)
|
||||
|
||||
set(ZCLASRC cpotrs.f)
|
||||
|
@ -189,11 +212,11 @@ set(DLASRC
|
|||
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
|
||||
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
|
||||
dgetc2.f dgetri.f
|
||||
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
|
||||
dggglm.f dgghrd.f dgglse.f dggqrf.f
|
||||
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f
|
||||
dgetc2.f dgetri.f dgetrf2.f
|
||||
dggbak.f dggbal.f dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f
|
||||
dggglm.f dgghd3.f dgghrd.f dgglse.f dggqrf.f
|
||||
dggrqf.f dggsvd3.f dggsvp3.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
|
||||
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
|
||||
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
|
||||
|
@ -210,12 +233,12 @@ set(DLASRC
|
|||
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
|
||||
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
|
||||
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
|
||||
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
|
||||
dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f
|
||||
dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
|
||||
dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
|
||||
dpbstf.f dpbsv.f dpbsvx.f
|
||||
dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f
|
||||
dposvx.f dpotrs.f dpstrf.f dpstf2.f
|
||||
dposvx.f dpotrf2.f dpotrs.f dpstrf.f dpstf2.f
|
||||
dppcon.f dppequ.f
|
||||
dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f
|
||||
dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f
|
||||
|
@ -234,7 +257,7 @@ set(DLASRC
|
|||
dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f
|
||||
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
|
||||
dtptrs.f
|
||||
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
|
||||
dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
|
||||
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
|
||||
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
|
||||
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
|
||||
|
@ -245,20 +268,28 @@ set(DLASRC
|
|||
dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f
|
||||
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
|
||||
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f
|
||||
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f
|
||||
dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f
|
||||
dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f
|
||||
dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f
|
||||
dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f
|
||||
dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f
|
||||
)
|
||||
|
||||
set(ZLASRC
|
||||
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
|
||||
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
|
||||
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
|
||||
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgejsv.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
|
||||
zgetri.f
|
||||
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
|
||||
zgghrd.f zgglse.f zggqrf.f zggrqf.f
|
||||
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
|
||||
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvj.f zgesvx.f zgetc2.f
|
||||
zgetri.f zgetrf2.f
|
||||
zggbak.f zggbal.f zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f zggglm.f
|
||||
zgghd3.f zgghrd.f zgglse.f zggqrf.f zggrqf.f
|
||||
DEPRECATED/zggsvd.f zggsvd3.f DEPRECATED/zggsvp.f zggsvp3.f
|
||||
zgsvj0.f zgsvj1.f
|
||||
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
|
||||
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
|
||||
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
|
||||
|
@ -287,28 +318,28 @@ set(ZLASRC
|
|||
zlarfg.f zlarft.f zlarfgp.f
|
||||
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
|
||||
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
|
||||
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
|
||||
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
|
||||
zposv.f zposvx.f zpotrf2.f zpotrs.f zpstrf.f zpstf2.f
|
||||
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
|
||||
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
|
||||
zrot.f zspcon.f zsprfs.f zspsv.f
|
||||
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
|
||||
zstegr.f zstein.f zsteqr.f
|
||||
zsycon.f
|
||||
zsycon.f zsysv_aa.f
|
||||
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f
|
||||
zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f
|
||||
zsyswapr.f zsytrs.f zsytrs_aa.f zsytrs2.f zsyconv.f
|
||||
zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f
|
||||
zsytri_rook.f zsycon_rook.f zsysv_rook.f
|
||||
ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f
|
||||
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
|
||||
ztprfs.f ztptri.f
|
||||
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
|
||||
ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
|
||||
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
|
||||
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
|
||||
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
|
||||
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunm22.f zunml2.f
|
||||
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
|
||||
zunmtr.f zupgtr.f
|
||||
zupmtr.f izmax1.f dzsum1.f zstemr.f
|
||||
|
@ -320,6 +351,15 @@ set(ZLASRC
|
|||
zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f
|
||||
zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f
|
||||
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f
|
||||
zgelq.f zgelqt.f zgelqt3.f zgemlq.f zgemlqt.f zgemqr.f zgeqr.f zgetsls.f
|
||||
zlamswlq.f zlamtsqr.f zlaswlq.f zlatsqr.f ztplqt.f ztplqt2.f ztpmlqt.f
|
||||
zhesv_aa.f zhetrf_aa.f zhetrs_aa.f zlahef_aa.f zsytf2_rk.f zlasyf_rk.f
|
||||
zsytrf_aa.f zsytrf_rk.f zsytrs_3.f zsycon_3.f zsytri_3.f zsytri_3x.f zsysv_rk.f
|
||||
zhetf2_rk.f zlahef_rk.f zhetrf_rk.f zhetrs_3.f zhecon_3.f zhetri_3.f
|
||||
zhetri_3x.f zhesv_rk.f zhb2st_kernels.f zhbev_2stage.f zhbevd_2stage.f
|
||||
zhbevx_2stage.f zheev_2stage.f zheevd_2stage.f zheevr_2stage.f
|
||||
zheevx_2stage.f zhegv_2stage.f zhetrd_2stage.f zhetrd_hb2st.F zhetrd_he2hb.f
|
||||
zlarfy.f
|
||||
)
|
||||
|
||||
set(LA_REL_SRC ${ALLAUX})
|
||||
|
|
|
@ -44,6 +44,8 @@ set(C_SRC
|
|||
lapacke_cgeevx_work.c
|
||||
lapacke_cgehrd.c
|
||||
lapacke_cgehrd_work.c
|
||||
lapacke_cgejsv.c
|
||||
lapacke_cgejsv_work.c
|
||||
lapacke_cgelq2.c
|
||||
lapacke_cgelq2_work.c
|
||||
lapacke_cgelqf.c
|
||||
|
@ -56,14 +58,14 @@ set(C_SRC
|
|||
lapacke_cgelss_work.c
|
||||
lapacke_cgelsy.c
|
||||
lapacke_cgelsy_work.c
|
||||
lapacke_cgemqr.c
|
||||
lapacke_cgemqr_work.c
|
||||
lapacke_cgemqrt.c
|
||||
lapacke_cgemqrt_work.c
|
||||
lapacke_cgeqlf.c
|
||||
lapacke_cgeqlf_work.c
|
||||
lapacke_cgeqp3.c
|
||||
lapacke_cgeqp3_work.c
|
||||
lapacke_cgeqpf.c
|
||||
lapacke_cgeqpf_work.c
|
||||
lapacke_cgeqr2.c
|
||||
lapacke_cgeqr2_work.c
|
||||
lapacke_cgeqrf.c
|
||||
|
@ -86,42 +88,56 @@ set(C_SRC
|
|||
lapacke_cgesv_work.c
|
||||
lapacke_cgesvd.c
|
||||
lapacke_cgesvd_work.c
|
||||
lapacke_cgesvdx.c
|
||||
lapacke_cgesvdx_work.c
|
||||
lapacke_cgesvj.c
|
||||
lapacke_cgesvj_work.c
|
||||
lapacke_cgesvx.c
|
||||
lapacke_cgesvx_work.c
|
||||
lapacke_cgetf2.c
|
||||
lapacke_cgetf2_work.c
|
||||
lapacke_cgetrf.c
|
||||
lapacke_cgetrf_work.c
|
||||
lapacke_cgetrf2.c
|
||||
lapacke_cgetrf2_work.c
|
||||
lapacke_cgetri.c
|
||||
lapacke_cgetri_work.c
|
||||
lapacke_cgetrs.c
|
||||
lapacke_cgetrs_work.c
|
||||
lapacke_cgetsls.c
|
||||
lapacke_cgetsls_work.c
|
||||
lapacke_cggbak.c
|
||||
lapacke_cggbak_work.c
|
||||
lapacke_cggbal.c
|
||||
lapacke_cggbal_work.c
|
||||
lapacke_cgges.c
|
||||
lapacke_cgges_work.c
|
||||
lapacke_cgges3.c
|
||||
lapacke_cgges3_work.c
|
||||
lapacke_cggesx.c
|
||||
lapacke_cggesx_work.c
|
||||
lapacke_cggev.c
|
||||
lapacke_cggev_work.c
|
||||
lapacke_cggev3.c
|
||||
lapacke_cggev3_work.c
|
||||
lapacke_cggevx.c
|
||||
lapacke_cggevx_work.c
|
||||
lapacke_cggglm.c
|
||||
lapacke_cggglm_work.c
|
||||
lapacke_cgghrd.c
|
||||
lapacke_cgghrd_work.c
|
||||
lapacke_cgghd3.c
|
||||
lapacke_cgghd3_work.c
|
||||
lapacke_cgglse.c
|
||||
lapacke_cgglse_work.c
|
||||
lapacke_cggqrf.c
|
||||
lapacke_cggqrf_work.c
|
||||
lapacke_cggrqf.c
|
||||
lapacke_cggrqf_work.c
|
||||
lapacke_cggsvd.c
|
||||
lapacke_cggsvd_work.c
|
||||
lapacke_cggsvp.c
|
||||
lapacke_cggsvp_work.c
|
||||
lapacke_cggsvd3.c
|
||||
lapacke_cggsvd3_work.c
|
||||
lapacke_cggsvp3.c
|
||||
lapacke_cggsvp3_work.c
|
||||
lapacke_cgtcon.c
|
||||
lapacke_cgtcon_work.c
|
||||
lapacke_cgtrfs.c
|
||||
|
@ -140,6 +156,12 @@ set(C_SRC
|
|||
lapacke_chbevd_work.c
|
||||
lapacke_chbevx.c
|
||||
lapacke_chbevx_work.c
|
||||
lapacke_chbev_2stage.c
|
||||
lapacke_chbev_2stage_work.c
|
||||
lapacke_chbevd_2stage.c
|
||||
lapacke_chbevd_2stage_work.c
|
||||
lapacke_chbevx_2stage.c
|
||||
lapacke_chbevx_2stage_work.c
|
||||
lapacke_chbgst.c
|
||||
lapacke_chbgst_work.c
|
||||
lapacke_chbgv.c
|
||||
|
@ -152,6 +174,8 @@ set(C_SRC
|
|||
lapacke_chbtrd_work.c
|
||||
lapacke_checon.c
|
||||
lapacke_checon_work.c
|
||||
lapacke_checon_3.c
|
||||
lapacke_checon_3_work.c
|
||||
lapacke_cheequb.c
|
||||
lapacke_cheequb_work.c
|
||||
lapacke_cheev.c
|
||||
|
@ -162,10 +186,20 @@ set(C_SRC
|
|||
lapacke_cheevr_work.c
|
||||
lapacke_cheevx.c
|
||||
lapacke_cheevx_work.c
|
||||
lapacke_cheev_2stage.c
|
||||
lapacke_cheev_2stage_work.c
|
||||
lapacke_cheevd_2stage.c
|
||||
lapacke_cheevd_2stage_work.c
|
||||
lapacke_cheevr_2stage.c
|
||||
lapacke_cheevr_2stage_work.c
|
||||
lapacke_cheevx_2stage.c
|
||||
lapacke_cheevx_2stage_work.c
|
||||
lapacke_chegst.c
|
||||
lapacke_chegst_work.c
|
||||
lapacke_chegv.c
|
||||
lapacke_chegv_work.c
|
||||
lapacke_chegv_2stage.c
|
||||
lapacke_chegv_2stage_work.c
|
||||
lapacke_chegvd.c
|
||||
lapacke_chegvd_work.c
|
||||
lapacke_chegvx.c
|
||||
|
@ -174,6 +208,10 @@ set(C_SRC
|
|||
lapacke_cherfs_work.c
|
||||
lapacke_chesv.c
|
||||
lapacke_chesv_work.c
|
||||
lapacke_chesv_aa.c
|
||||
lapacke_chesv_aa_work.c
|
||||
lapacke_chesv_rk.c
|
||||
lapacke_chesv_rk_work.c
|
||||
lapacke_chesvx.c
|
||||
lapacke_chesvx_work.c
|
||||
lapacke_cheswapr.c
|
||||
|
@ -181,17 +219,31 @@ set(C_SRC
|
|||
lapacke_chetrd.c
|
||||
lapacke_chetrd_work.c
|
||||
lapacke_chetrf.c
|
||||
lapacke_chetrf_rook.c
|
||||
lapacke_chetrf_work.c
|
||||
lapacke_chetrf_rook_work.c
|
||||
lapacke_chetrf_aa.c
|
||||
lapacke_chetrf_aa_work.c
|
||||
lapacke_chetrf_rk.c
|
||||
lapacke_chetrf_rk_work.c
|
||||
lapacke_chetri.c
|
||||
lapacke_chetri2.c
|
||||
lapacke_chetri2_work.c
|
||||
lapacke_chetri_3.c
|
||||
lapacke_chetri_3_work.c
|
||||
lapacke_chetri2x.c
|
||||
lapacke_chetri2x_work.c
|
||||
lapacke_chetri_work.c
|
||||
lapacke_chetrs.c
|
||||
lapacke_chetrs_rook.c
|
||||
lapacke_chetrs2.c
|
||||
lapacke_chetrs2_work.c
|
||||
lapacke_chetrs_work.c
|
||||
lapacke_chetrs_rook_work.c
|
||||
lapacke_chetrs_aa.c
|
||||
lapacke_chetrs_aa_work.c
|
||||
lapacke_chetrs_3.c
|
||||
lapacke_chetrs_3_work.c
|
||||
lapacke_chfrk.c
|
||||
lapacke_chfrk_work.c
|
||||
lapacke_chgeqz.c
|
||||
|
@ -250,6 +302,8 @@ set(C_SRC
|
|||
lapacke_clantr_work.c
|
||||
lapacke_clapmr.c
|
||||
lapacke_clapmr_work.c
|
||||
lapacke_clapmt.c
|
||||
lapacke_clapmt_work.c
|
||||
lapacke_clarfb.c
|
||||
lapacke_clarfb_work.c
|
||||
lapacke_clarfg.c
|
||||
|
@ -260,6 +314,8 @@ set(C_SRC
|
|||
lapacke_clarfx_work.c
|
||||
lapacke_clarnv.c
|
||||
lapacke_clarnv_work.c
|
||||
lapacke_clascl.c
|
||||
lapacke_clascl_work.c
|
||||
lapacke_claset.c
|
||||
lapacke_claset_work.c
|
||||
lapacke_claswp.c
|
||||
|
@ -302,6 +358,8 @@ set(C_SRC
|
|||
lapacke_cposvx_work.c
|
||||
lapacke_cpotrf.c
|
||||
lapacke_cpotrf_work.c
|
||||
lapacke_cpotrf2.c
|
||||
lapacke_cpotrf2_work.c
|
||||
lapacke_cpotri.c
|
||||
lapacke_cpotri_work.c
|
||||
lapacke_cpotrs.c
|
||||
|
@ -364,6 +422,8 @@ set(C_SRC
|
|||
lapacke_csteqr_work.c
|
||||
lapacke_csycon.c
|
||||
lapacke_csycon_work.c
|
||||
lapacke_csycon_3.c
|
||||
lapacke_csycon_3_work.c
|
||||
lapacke_csyconv.c
|
||||
lapacke_csyconv_work.c
|
||||
lapacke_csyequb.c
|
||||
|
@ -374,22 +434,40 @@ set(C_SRC
|
|||
lapacke_csysv_rook.c
|
||||
lapacke_csysv_rook_work.c
|
||||
lapacke_csysv_work.c
|
||||
lapacke_csysv_aa.c
|
||||
lapacke_csysv_aa_work.c
|
||||
lapacke_csysv_rk.c
|
||||
lapacke_csysv_rk_work.c
|
||||
lapacke_csysvx.c
|
||||
lapacke_csysvx_work.c
|
||||
lapacke_csyswapr.c
|
||||
lapacke_csyswapr_work.c
|
||||
lapacke_csytrf.c
|
||||
lapacke_csytrf_work.c
|
||||
lapacke_csytrf_rook.c
|
||||
lapacke_csytrf_rook_work.c
|
||||
lapacke_csytrf_aa.c
|
||||
lapacke_csytrf_aa_work.c
|
||||
lapacke_csytrf_rk.c
|
||||
lapacke_csytrf_rk_work.c
|
||||
lapacke_csytri.c
|
||||
lapacke_csytri2.c
|
||||
lapacke_csytri2_work.c
|
||||
lapacke_csytri_3.c
|
||||
lapacke_csytri_3_work.c
|
||||
lapacke_csytri2x.c
|
||||
lapacke_csytri2x_work.c
|
||||
lapacke_csytri_work.c
|
||||
lapacke_csytrs.c
|
||||
lapacke_csytrs_rook.c
|
||||
lapacke_csytrs2.c
|
||||
lapacke_csytrs2_work.c
|
||||
lapacke_csytrs_work.c
|
||||
lapacke_csytrs_rook_work.c
|
||||
lapacke_csytrs_aa.c
|
||||
lapacke_csytrs_aa_work.c
|
||||
lapacke_csytrs_3.c
|
||||
lapacke_csytrs_3_work.c
|
||||
lapacke_ctbcon.c
|
||||
lapacke_ctbcon_work.c
|
||||
lapacke_ctbrfs.c
|
||||
|
@ -464,6 +542,8 @@ set(C_SRC
|
|||
lapacke_cunbdb_work.c
|
||||
lapacke_cuncsd.c
|
||||
lapacke_cuncsd_work.c
|
||||
lapacke_cuncsd2by1.c
|
||||
lapacke_cuncsd2by1_work.c
|
||||
lapacke_cungbr.c
|
||||
lapacke_cungbr_work.c
|
||||
lapacke_cunghr.c
|
||||
|
@ -505,6 +585,8 @@ set(DSRC
|
|||
lapacke_dbbcsd_work.c
|
||||
lapacke_dbdsdc.c
|
||||
lapacke_dbdsdc_work.c
|
||||
lapacke_dbdsvdx.c
|
||||
lapacke_dbdsvdx_work.c
|
||||
lapacke_dbdsqr.c
|
||||
lapacke_dbdsqr_work.c
|
||||
lapacke_ddisna.c
|
||||
|
@ -563,14 +645,14 @@ set(DSRC
|
|||
lapacke_dgelss_work.c
|
||||
lapacke_dgelsy.c
|
||||
lapacke_dgelsy_work.c
|
||||
lapacke_dgemqr.c
|
||||
lapacke_dgemqr_work.c
|
||||
lapacke_dgemqrt.c
|
||||
lapacke_dgemqrt_work.c
|
||||
lapacke_dgeqlf.c
|
||||
lapacke_dgeqlf_work.c
|
||||
lapacke_dgeqp3.c
|
||||
lapacke_dgeqp3_work.c
|
||||
lapacke_dgeqpf.c
|
||||
lapacke_dgeqpf_work.c
|
||||
lapacke_dgeqr2.c
|
||||
lapacke_dgeqr2_work.c
|
||||
lapacke_dgeqrf.c
|
||||
|
@ -593,6 +675,8 @@ set(DSRC
|
|||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
lapacke_dgesvd_work.c
|
||||
lapacke_dgesvdx.c
|
||||
lapacke_dgesvdx_work.c
|
||||
lapacke_dgesvj.c
|
||||
lapacke_dgesvj_work.c
|
||||
lapacke_dgesvx.c
|
||||
|
@ -601,36 +685,46 @@ set(DSRC
|
|||
lapacke_dgetf2_work.c
|
||||
lapacke_dgetrf.c
|
||||
lapacke_dgetrf_work.c
|
||||
lapacke_dgetrf2.c
|
||||
lapacke_dgetrf2_work.c
|
||||
lapacke_dgetri.c
|
||||
lapacke_dgetri_work.c
|
||||
lapacke_dgetrs.c
|
||||
lapacke_dgetrs_work.c
|
||||
lapacke_dgetsls.c
|
||||
lapacke_dgetsls_work.c
|
||||
lapacke_dggbak.c
|
||||
lapacke_dggbak_work.c
|
||||
lapacke_dggbal.c
|
||||
lapacke_dggbal_work.c
|
||||
lapacke_dgges.c
|
||||
lapacke_dgges_work.c
|
||||
lapacke_dgges3.c
|
||||
lapacke_dgges3_work.c
|
||||
lapacke_dggesx.c
|
||||
lapacke_dggesx_work.c
|
||||
lapacke_dggev.c
|
||||
lapacke_dggev_work.c
|
||||
lapacke_dggev3.c
|
||||
lapacke_dggev3_work.c
|
||||
lapacke_dggevx.c
|
||||
lapacke_dggevx_work.c
|
||||
lapacke_dggglm.c
|
||||
lapacke_dggglm_work.c
|
||||
lapacke_dgghrd.c
|
||||
lapacke_dgghrd_work.c
|
||||
lapacke_dgghd3.c
|
||||
lapacke_dgghd3_work.c
|
||||
lapacke_dgglse.c
|
||||
lapacke_dgglse_work.c
|
||||
lapacke_dggqrf.c
|
||||
lapacke_dggqrf_work.c
|
||||
lapacke_dggrqf.c
|
||||
lapacke_dggrqf_work.c
|
||||
lapacke_dggsvd.c
|
||||
lapacke_dggsvd_work.c
|
||||
lapacke_dggsvp.c
|
||||
lapacke_dggsvp_work.c
|
||||
lapacke_dggsvd3.c
|
||||
lapacke_dggsvd3_work.c
|
||||
lapacke_dggsvp3.c
|
||||
lapacke_dggsvp3_work.c
|
||||
lapacke_dgtcon.c
|
||||
lapacke_dgtcon_work.c
|
||||
lapacke_dgtrfs.c
|
||||
|
@ -665,6 +759,8 @@ set(DSRC
|
|||
lapacke_dlantr_work.c
|
||||
lapacke_dlapmr.c
|
||||
lapacke_dlapmr_work.c
|
||||
lapacke_dlapmt.c
|
||||
lapacke_dlapmt_work.c
|
||||
lapacke_dlapy2.c
|
||||
lapacke_dlapy2_work.c
|
||||
lapacke_dlapy3.c
|
||||
|
@ -683,6 +779,8 @@ set(DSRC
|
|||
lapacke_dlartgp_work.c
|
||||
lapacke_dlartgs.c
|
||||
lapacke_dlartgs_work.c
|
||||
lapacke_dlascl.c
|
||||
lapacke_dlascl_work.c
|
||||
lapacke_dlaset.c
|
||||
lapacke_dlaset_work.c
|
||||
lapacke_dlasrt.c
|
||||
|
@ -697,6 +795,8 @@ set(DSRC
|
|||
lapacke_dopmtr_work.c
|
||||
lapacke_dorbdb.c
|
||||
lapacke_dorbdb_work.c
|
||||
lapacke_dorcsd2by1.c
|
||||
lapacke_dorcsd2by1_work.c
|
||||
lapacke_dorcsd.c
|
||||
lapacke_dorcsd_work.c
|
||||
lapacke_dorgbr.c
|
||||
|
@ -765,6 +865,8 @@ set(DSRC
|
|||
lapacke_dposvx_work.c
|
||||
lapacke_dpotrf.c
|
||||
lapacke_dpotrf_work.c
|
||||
lapacke_dpotrf2.c
|
||||
lapacke_dpotrf2_work.c
|
||||
lapacke_dpotri.c
|
||||
lapacke_dpotri_work.c
|
||||
lapacke_dpotrs.c
|
||||
|
@ -807,6 +909,12 @@ set(DSRC
|
|||
lapacke_dsbevd_work.c
|
||||
lapacke_dsbevx.c
|
||||
lapacke_dsbevx_work.c
|
||||
lapacke_dsbev_2stage.c
|
||||
lapacke_dsbev_2stage_work.c
|
||||
lapacke_dsbevd_2stage.c
|
||||
lapacke_dsbevd_2stage_work.c
|
||||
lapacke_dsbevx_2stage.c
|
||||
lapacke_dsbevx_2stage_work.c
|
||||
lapacke_dsbgst.c
|
||||
lapacke_dsbgst_work.c
|
||||
lapacke_dsbgv.c
|
||||
|
@ -877,6 +985,8 @@ set(DSRC
|
|||
lapacke_dstevx_work.c
|
||||
lapacke_dsycon.c
|
||||
lapacke_dsycon_work.c
|
||||
lapacke_dsycon_3.c
|
||||
lapacke_dsycon_3_work.c
|
||||
lapacke_dsyconv.c
|
||||
lapacke_dsyconv_work.c
|
||||
lapacke_dsyequb.c
|
||||
|
@ -889,10 +999,20 @@ set(DSRC
|
|||
lapacke_dsyevr_work.c
|
||||
lapacke_dsyevx.c
|
||||
lapacke_dsyevx_work.c
|
||||
lapacke_dsyev_2stage.c
|
||||
lapacke_dsyev_2stage_work.c
|
||||
lapacke_dsyevd_2stage.c
|
||||
lapacke_dsyevd_2stage_work.c
|
||||
lapacke_dsyevr_2stage.c
|
||||
lapacke_dsyevr_2stage_work.c
|
||||
lapacke_dsyevx_2stage.c
|
||||
lapacke_dsyevx_2stage_work.c
|
||||
lapacke_dsygst.c
|
||||
lapacke_dsygst_work.c
|
||||
lapacke_dsygv.c
|
||||
lapacke_dsygv_work.c
|
||||
lapacke_dsygv_2stage.c
|
||||
lapacke_dsygv_2stage_work.c
|
||||
lapacke_dsygvd.c
|
||||
lapacke_dsygvd_work.c
|
||||
lapacke_dsygvx.c
|
||||
|
@ -903,6 +1023,10 @@ set(DSRC
|
|||
lapacke_dsysv_rook.c
|
||||
lapacke_dsysv_rook_work.c
|
||||
lapacke_dsysv_work.c
|
||||
lapacke_dsysv_aa.c
|
||||
lapacke_dsysv_aa_work.c
|
||||
lapacke_dsysv_rk.c
|
||||
lapacke_dsysv_rk_work.c
|
||||
lapacke_dsysvx.c
|
||||
lapacke_dsysvx_work.c
|
||||
lapacke_dsyswapr.c
|
||||
|
@ -911,16 +1035,30 @@ set(DSRC
|
|||
lapacke_dsytrd_work.c
|
||||
lapacke_dsytrf.c
|
||||
lapacke_dsytrf_work.c
|
||||
lapacke_dsytrf_rook.c
|
||||
lapacke_dsytrf_rook_work.c
|
||||
lapacke_dsytrf_aa.c
|
||||
lapacke_dsytrf_aa_work.c
|
||||
lapacke_dsytrf_rk.c
|
||||
lapacke_dsytrf_rk_work.c
|
||||
lapacke_dsytri.c
|
||||
lapacke_dsytri2.c
|
||||
lapacke_dsytri2_work.c
|
||||
lapacke_dsytri_3.c
|
||||
lapacke_dsytri_3_work.c
|
||||
lapacke_dsytri2x.c
|
||||
lapacke_dsytri2x_work.c
|
||||
lapacke_dsytri_work.c
|
||||
lapacke_dsytrs.c
|
||||
lapacke_dsytrs_rook.c
|
||||
lapacke_dsytrs2.c
|
||||
lapacke_dsytrs2_work.c
|
||||
lapacke_dsytrs_aa.c
|
||||
lapacke_dsytrs_aa_work.c
|
||||
lapacke_dsytrs_3.c
|
||||
lapacke_dsytrs_3_work.c
|
||||
lapacke_dsytrs_work.c
|
||||
lapacke_dsytrs_rook_work.c
|
||||
lapacke_dtbcon.c
|
||||
lapacke_dtbcon_work.c
|
||||
lapacke_dtbrfs.c
|
||||
|
@ -998,6 +1136,8 @@ set(SSRC
|
|||
lapacke_sbbcsd_work.c
|
||||
lapacke_sbdsdc.c
|
||||
lapacke_sbdsdc_work.c
|
||||
lapacke_sbdsvdx.c
|
||||
lapacke_sbdsvdx_work.c
|
||||
lapacke_sbdsqr.c
|
||||
lapacke_sbdsqr_work.c
|
||||
lapacke_sdisna.c
|
||||
|
@ -1056,14 +1196,14 @@ set(SSRC
|
|||
lapacke_sgelss_work.c
|
||||
lapacke_sgelsy.c
|
||||
lapacke_sgelsy_work.c
|
||||
lapacke_sgemqr.c
|
||||
lapacke_sgemqr_work.c
|
||||
lapacke_sgemqrt.c
|
||||
lapacke_sgemqrt_work.c
|
||||
lapacke_sgeqlf.c
|
||||
lapacke_sgeqlf_work.c
|
||||
lapacke_sgeqp3.c
|
||||
lapacke_sgeqp3_work.c
|
||||
lapacke_sgeqpf.c
|
||||
lapacke_sgeqpf_work.c
|
||||
lapacke_sgeqr2.c
|
||||
lapacke_sgeqr2_work.c
|
||||
lapacke_sgeqrf.c
|
||||
|
@ -1086,6 +1226,8 @@ set(SSRC
|
|||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
lapacke_sgesvd_work.c
|
||||
lapacke_sgesvdx.c
|
||||
lapacke_sgesvdx_work.c
|
||||
lapacke_sgesvj.c
|
||||
lapacke_sgesvj_work.c
|
||||
lapacke_sgesvx.c
|
||||
|
@ -1094,36 +1236,46 @@ set(SSRC
|
|||
lapacke_sgetf2_work.c
|
||||
lapacke_sgetrf.c
|
||||
lapacke_sgetrf_work.c
|
||||
lapacke_sgetrf2.c
|
||||
lapacke_sgetrf2_work.c
|
||||
lapacke_sgetri.c
|
||||
lapacke_sgetri_work.c
|
||||
lapacke_sgetrs.c
|
||||
lapacke_sgetrs_work.c
|
||||
lapacke_sgetsls.c
|
||||
lapacke_sgetsls_work.c
|
||||
lapacke_sggbak.c
|
||||
lapacke_sggbak_work.c
|
||||
lapacke_sggbal.c
|
||||
lapacke_sggbal_work.c
|
||||
lapacke_sgges.c
|
||||
lapacke_sgges_work.c
|
||||
lapacke_sgges3.c
|
||||
lapacke_sgges3_work.c
|
||||
lapacke_sggesx.c
|
||||
lapacke_sggesx_work.c
|
||||
lapacke_sggev.c
|
||||
lapacke_sggev_work.c
|
||||
lapacke_sggev3.c
|
||||
lapacke_sggev3_work.c
|
||||
lapacke_sggevx.c
|
||||
lapacke_sggevx_work.c
|
||||
lapacke_sggglm.c
|
||||
lapacke_sggglm_work.c
|
||||
lapacke_sgghrd.c
|
||||
lapacke_sgghrd_work.c
|
||||
lapacke_sgghd3.c
|
||||
lapacke_sgghd3_work.c
|
||||
lapacke_sgglse.c
|
||||
lapacke_sgglse_work.c
|
||||
lapacke_sggqrf.c
|
||||
lapacke_sggqrf_work.c
|
||||
lapacke_sggrqf.c
|
||||
lapacke_sggrqf_work.c
|
||||
lapacke_sggsvd.c
|
||||
lapacke_sggsvd_work.c
|
||||
lapacke_sggsvp.c
|
||||
lapacke_sggsvp_work.c
|
||||
lapacke_sggsvd3.c
|
||||
lapacke_sggsvd3_work.c
|
||||
lapacke_sggsvp3.c
|
||||
lapacke_sggsvp3_work.c
|
||||
lapacke_sgtcon.c
|
||||
lapacke_sgtcon_work.c
|
||||
lapacke_sgtrfs.c
|
||||
|
@ -1158,6 +1310,8 @@ set(SSRC
|
|||
lapacke_slantr_work.c
|
||||
lapacke_slapmr.c
|
||||
lapacke_slapmr_work.c
|
||||
lapacke_slapmt.c
|
||||
lapacke_slapmt_work.c
|
||||
lapacke_slapy2.c
|
||||
lapacke_slapy2_work.c
|
||||
lapacke_slapy3.c
|
||||
|
@ -1176,6 +1330,8 @@ set(SSRC
|
|||
lapacke_slartgp_work.c
|
||||
lapacke_slartgs.c
|
||||
lapacke_slartgs_work.c
|
||||
lapacke_slascl.c
|
||||
lapacke_slascl_work.c
|
||||
lapacke_slaset.c
|
||||
lapacke_slaset_work.c
|
||||
lapacke_slasrt.c
|
||||
|
@ -1192,6 +1348,8 @@ set(SSRC
|
|||
lapacke_sorbdb_work.c
|
||||
lapacke_sorcsd.c
|
||||
lapacke_sorcsd_work.c
|
||||
lapacke_sorcsd2by1.c
|
||||
lapacke_sorcsd2by1_work.c
|
||||
lapacke_sorgbr.c
|
||||
lapacke_sorgbr_work.c
|
||||
lapacke_sorghr.c
|
||||
|
@ -1258,6 +1416,8 @@ set(SSRC
|
|||
lapacke_sposvx_work.c
|
||||
lapacke_spotrf.c
|
||||
lapacke_spotrf_work.c
|
||||
lapacke_spotrf2.c
|
||||
lapacke_spotrf2_work.c
|
||||
lapacke_spotri.c
|
||||
lapacke_spotri_work.c
|
||||
lapacke_spotrs.c
|
||||
|
@ -1300,6 +1460,12 @@ set(SSRC
|
|||
lapacke_ssbevd_work.c
|
||||
lapacke_ssbevx.c
|
||||
lapacke_ssbevx_work.c
|
||||
lapacke_ssbev_2stage.c
|
||||
lapacke_ssbev_2stage_work.c
|
||||
lapacke_ssbevd_2stage.c
|
||||
lapacke_ssbevd_2stage_work.c
|
||||
lapacke_ssbevx_2stage.c
|
||||
lapacke_ssbevx_2stage_work.c
|
||||
lapacke_ssbgst.c
|
||||
lapacke_ssbgst_work.c
|
||||
lapacke_ssbgv.c
|
||||
|
@ -1366,6 +1532,8 @@ set(SSRC
|
|||
lapacke_sstevx_work.c
|
||||
lapacke_ssycon.c
|
||||
lapacke_ssycon_work.c
|
||||
lapacke_ssycon_3.c
|
||||
lapacke_ssycon_3_work.c
|
||||
lapacke_ssyconv.c
|
||||
lapacke_ssyconv_work.c
|
||||
lapacke_ssyequb.c
|
||||
|
@ -1378,10 +1546,20 @@ set(SSRC
|
|||
lapacke_ssyevr_work.c
|
||||
lapacke_ssyevx.c
|
||||
lapacke_ssyevx_work.c
|
||||
lapacke_ssyev_2stage.c
|
||||
lapacke_ssyev_2stage_work.c
|
||||
lapacke_ssyevd_2stage.c
|
||||
lapacke_ssyevd_2stage_work.c
|
||||
lapacke_ssyevr_2stage.c
|
||||
lapacke_ssyevr_2stage_work.c
|
||||
lapacke_ssyevx_2stage.c
|
||||
lapacke_ssyevx_2stage_work.c
|
||||
lapacke_ssygst.c
|
||||
lapacke_ssygst_work.c
|
||||
lapacke_ssygv.c
|
||||
lapacke_ssygv_work.c
|
||||
lapacke_ssygv_2stage.c
|
||||
lapacke_ssygv_2stage_work.c
|
||||
lapacke_ssygvd.c
|
||||
lapacke_ssygvd_work.c
|
||||
lapacke_ssygvx.c
|
||||
|
@ -1392,6 +1570,10 @@ set(SSRC
|
|||
lapacke_ssysv_rook.c
|
||||
lapacke_ssysv_rook_work.c
|
||||
lapacke_ssysv_work.c
|
||||
lapacke_ssysv_aa.c
|
||||
lapacke_ssysv_aa_work.c
|
||||
lapacke_ssysv_rk.c
|
||||
lapacke_ssysv_rk_work.c
|
||||
lapacke_ssysvx.c
|
||||
lapacke_ssysvx_work.c
|
||||
lapacke_ssyswapr.c
|
||||
|
@ -1400,16 +1582,30 @@ set(SSRC
|
|||
lapacke_ssytrd_work.c
|
||||
lapacke_ssytrf.c
|
||||
lapacke_ssytrf_work.c
|
||||
lapacke_ssytrf_rook.c
|
||||
lapacke_ssytrf_rook_work.c
|
||||
lapacke_ssytrf_aa.c
|
||||
lapacke_ssytrf_aa_work.c
|
||||
lapacke_ssytrf_rk.c
|
||||
lapacke_ssytrf_rk_work.c
|
||||
lapacke_ssytri.c
|
||||
lapacke_ssytri2.c
|
||||
lapacke_ssytri2_work.c
|
||||
lapacke_ssytri_3.c
|
||||
lapacke_ssytri_3_work.c
|
||||
lapacke_ssytri2x.c
|
||||
lapacke_ssytri2x_work.c
|
||||
lapacke_ssytri_work.c
|
||||
lapacke_ssytrs.c
|
||||
lapacke_ssytrs_rook.c
|
||||
lapacke_ssytrs2.c
|
||||
lapacke_ssytrs2_work.c
|
||||
lapacke_ssytrs_aa.c
|
||||
lapacke_ssytrs_aa_work.c
|
||||
lapacke_ssytrs_3.c
|
||||
lapacke_ssytrs_3_work.c
|
||||
lapacke_ssytrs_work.c
|
||||
lapacke_ssytrs_rook_work.c
|
||||
lapacke_stbcon.c
|
||||
lapacke_stbcon_work.c
|
||||
lapacke_stbrfs.c
|
||||
|
@ -1440,6 +1636,8 @@ set(SSRC
|
|||
lapacke_stpcon_work.c
|
||||
lapacke_stpmqrt.c
|
||||
lapacke_stpmqrt_work.c
|
||||
lapacke_stpqrt.c
|
||||
lapacke_stpqrt_work.c
|
||||
lapacke_stpqrt2.c
|
||||
lapacke_stpqrt2_work.c
|
||||
lapacke_stprfb.c
|
||||
|
@ -1529,6 +1727,8 @@ set(ZSRC
|
|||
lapacke_zgeevx_work.c
|
||||
lapacke_zgehrd.c
|
||||
lapacke_zgehrd_work.c
|
||||
lapacke_zgejsv.c
|
||||
lapacke_zgejsv_work.c
|
||||
lapacke_zgelq2.c
|
||||
lapacke_zgelq2_work.c
|
||||
lapacke_zgelqf.c
|
||||
|
@ -1541,14 +1741,14 @@ set(ZSRC
|
|||
lapacke_zgelss_work.c
|
||||
lapacke_zgelsy.c
|
||||
lapacke_zgelsy_work.c
|
||||
lapacke_zgemqr.c
|
||||
lapacke_zgemqr_work.c
|
||||
lapacke_zgemqrt.c
|
||||
lapacke_zgemqrt_work.c
|
||||
lapacke_zgeqlf.c
|
||||
lapacke_zgeqlf_work.c
|
||||
lapacke_zgeqp3.c
|
||||
lapacke_zgeqp3_work.c
|
||||
lapacke_zgeqpf.c
|
||||
lapacke_zgeqpf_work.c
|
||||
lapacke_zgeqr2.c
|
||||
lapacke_zgeqr2_work.c
|
||||
lapacke_zgeqrf.c
|
||||
|
@ -1571,42 +1771,56 @@ set(ZSRC
|
|||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
lapacke_zgesvd_work.c
|
||||
lapacke_zgesvdx.c
|
||||
lapacke_zgesvdx_work.c
|
||||
lapacke_zgesvj.c
|
||||
lapacke_zgesvj_work.c
|
||||
lapacke_zgesvx.c
|
||||
lapacke_zgesvx_work.c
|
||||
lapacke_zgetf2.c
|
||||
lapacke_zgetf2_work.c
|
||||
lapacke_zgetrf.c
|
||||
lapacke_zgetrf_work.c
|
||||
lapacke_zgetrf2.c
|
||||
lapacke_zgetrf2_work.c
|
||||
lapacke_zgetri.c
|
||||
lapacke_zgetri_work.c
|
||||
lapacke_zgetrs.c
|
||||
lapacke_zgetrs_work.c
|
||||
lapacke_zgetsls.c
|
||||
lapacke_zgetsls_work.c
|
||||
lapacke_zggbak.c
|
||||
lapacke_zggbak_work.c
|
||||
lapacke_zggbal.c
|
||||
lapacke_zggbal_work.c
|
||||
lapacke_zgges.c
|
||||
lapacke_zgges_work.c
|
||||
lapacke_zgges3.c
|
||||
lapacke_zgges3_work.c
|
||||
lapacke_zggesx.c
|
||||
lapacke_zggesx_work.c
|
||||
lapacke_zggev.c
|
||||
lapacke_zggev_work.c
|
||||
lapacke_zggev3.c
|
||||
lapacke_zggev3_work.c
|
||||
lapacke_zggevx.c
|
||||
lapacke_zggevx_work.c
|
||||
lapacke_zggglm.c
|
||||
lapacke_zggglm_work.c
|
||||
lapacke_zgghrd.c
|
||||
lapacke_zgghrd_work.c
|
||||
lapacke_zgghd3.c
|
||||
lapacke_zgghd3_work.c
|
||||
lapacke_zgglse.c
|
||||
lapacke_zgglse_work.c
|
||||
lapacke_zggqrf.c
|
||||
lapacke_zggqrf_work.c
|
||||
lapacke_zggrqf.c
|
||||
lapacke_zggrqf_work.c
|
||||
lapacke_zggsvd.c
|
||||
lapacke_zggsvd_work.c
|
||||
lapacke_zggsvp.c
|
||||
lapacke_zggsvp_work.c
|
||||
lapacke_zggsvd3.c
|
||||
lapacke_zggsvd3_work.c
|
||||
lapacke_zggsvp3.c
|
||||
lapacke_zggsvp3_work.c
|
||||
lapacke_zgtcon.c
|
||||
lapacke_zgtcon_work.c
|
||||
lapacke_zgtrfs.c
|
||||
|
@ -1637,6 +1851,8 @@ set(ZSRC
|
|||
lapacke_zhbtrd_work.c
|
||||
lapacke_zhecon.c
|
||||
lapacke_zhecon_work.c
|
||||
lapacke_zhecon_3.c
|
||||
lapacke_zhecon_3_work.c
|
||||
lapacke_zheequb.c
|
||||
lapacke_zheequb_work.c
|
||||
lapacke_zheev.c
|
||||
|
@ -1647,10 +1863,20 @@ set(ZSRC
|
|||
lapacke_zheevr_work.c
|
||||
lapacke_zheevx.c
|
||||
lapacke_zheevx_work.c
|
||||
lapacke_zheev_2stage.c
|
||||
lapacke_zheev_2stage_work.c
|
||||
lapacke_zheevd_2stage.c
|
||||
lapacke_zheevd_2stage_work.c
|
||||
lapacke_zheevr_2stage.c
|
||||
lapacke_zheevr_2stage_work.c
|
||||
lapacke_zheevx_2stage.c
|
||||
lapacke_zheevx_2stage_work.c
|
||||
lapacke_zhegst.c
|
||||
lapacke_zhegst_work.c
|
||||
lapacke_zhegv.c
|
||||
lapacke_zhegv_work.c
|
||||
lapacke_zhegv_2stage.c
|
||||
lapacke_zhegv_2stage_work.c
|
||||
lapacke_zhegvd.c
|
||||
lapacke_zhegvd_work.c
|
||||
lapacke_zhegvx.c
|
||||
|
@ -1659,6 +1885,10 @@ set(ZSRC
|
|||
lapacke_zherfs_work.c
|
||||
lapacke_zhesv.c
|
||||
lapacke_zhesv_work.c
|
||||
lapacke_zhesv_aa.c
|
||||
lapacke_zhesv_aa_work.c
|
||||
lapacke_zhesv_rk.c
|
||||
lapacke_zhesv_rk_work.c
|
||||
lapacke_zhesvx.c
|
||||
lapacke_zhesvx_work.c
|
||||
lapacke_zheswapr.c
|
||||
|
@ -1666,17 +1896,31 @@ set(ZSRC
|
|||
lapacke_zhetrd.c
|
||||
lapacke_zhetrd_work.c
|
||||
lapacke_zhetrf.c
|
||||
lapacke_zhetrf_rook.c
|
||||
lapacke_zhetrf_work.c
|
||||
lapacke_zhetrf_rook_work.c
|
||||
lapacke_zhetrf_aa.c
|
||||
lapacke_zhetrf_aa_work.c
|
||||
lapacke_zhetrf_rk.c
|
||||
lapacke_zhetrf_rk_work.c
|
||||
lapacke_zhetri.c
|
||||
lapacke_zhetri2.c
|
||||
lapacke_zhetri2_work.c
|
||||
lapacke_zhetri_3.c
|
||||
lapacke_zhetri_3_work.c
|
||||
lapacke_zhetri2x.c
|
||||
lapacke_zhetri2x_work.c
|
||||
lapacke_zhetri_work.c
|
||||
lapacke_zhetrs.c
|
||||
lapacke_zhetrs_rook.c
|
||||
lapacke_zhetrs2.c
|
||||
lapacke_zhetrs2_work.c
|
||||
lapacke_zhetrs_work.c
|
||||
lapacke_zhetrs_aa.c
|
||||
lapacke_zhetrs_aa_work.c
|
||||
lapacke_zhetrs_3.c
|
||||
lapacke_zhetrs_3_work.c
|
||||
lapacke_zhetrs_rook_work.c
|
||||
lapacke_zhfrk.c
|
||||
lapacke_zhfrk_work.c
|
||||
lapacke_zhgeqz.c
|
||||
|
@ -1735,6 +1979,8 @@ set(ZSRC
|
|||
lapacke_zlantr_work.c
|
||||
lapacke_zlapmr.c
|
||||
lapacke_zlapmr_work.c
|
||||
lapacke_zlapmt.c
|
||||
lapacke_zlapmt_work.c
|
||||
lapacke_zlarfb.c
|
||||
lapacke_zlarfb_work.c
|
||||
lapacke_zlarfg.c
|
||||
|
@ -1745,6 +1991,8 @@ set(ZSRC
|
|||
lapacke_zlarfx_work.c
|
||||
lapacke_zlarnv.c
|
||||
lapacke_zlarnv_work.c
|
||||
lapacke_zlascl.c
|
||||
lapacke_zlascl_work.c
|
||||
lapacke_zlaset.c
|
||||
lapacke_zlaset_work.c
|
||||
lapacke_zlaswp.c
|
||||
|
@ -1787,6 +2035,8 @@ set(ZSRC
|
|||
lapacke_zposvx_work.c
|
||||
lapacke_zpotrf.c
|
||||
lapacke_zpotrf_work.c
|
||||
lapacke_zpotrf2.c
|
||||
lapacke_zpotrf2_work.c
|
||||
lapacke_zpotri.c
|
||||
lapacke_zpotri_work.c
|
||||
lapacke_zpotrs.c
|
||||
|
@ -1849,6 +2099,8 @@ set(ZSRC
|
|||
lapacke_zsteqr_work.c
|
||||
lapacke_zsycon.c
|
||||
lapacke_zsycon_work.c
|
||||
lapacke_zsycon_3.c
|
||||
lapacke_zsycon_3_work.c
|
||||
lapacke_zsyconv.c
|
||||
lapacke_zsyconv_work.c
|
||||
lapacke_zsyequb.c
|
||||
|
@ -1859,22 +2111,40 @@ set(ZSRC
|
|||
lapacke_zsysv_rook.c
|
||||
lapacke_zsysv_rook_work.c
|
||||
lapacke_zsysv_work.c
|
||||
lapacke_zsysv_aa.c
|
||||
lapacke_zsysv_aa_work.c
|
||||
lapacke_zsysv_rk.c
|
||||
lapacke_zsysv_rk_work.c
|
||||
lapacke_zsysvx.c
|
||||
lapacke_zsysvx_work.c
|
||||
lapacke_zsyswapr.c
|
||||
lapacke_zsyswapr_work.c
|
||||
lapacke_zsytrf.c
|
||||
lapacke_zsytrf_work.c
|
||||
lapacke_zsytrf_rook.c
|
||||
lapacke_zsytrf_rook_work.c
|
||||
lapacke_zsytrf_aa.c
|
||||
lapacke_zsytrf_aa_work.c
|
||||
lapacke_zsytrf_rk.c
|
||||
lapacke_zsytrf_rk_work.c
|
||||
lapacke_zsytri.c
|
||||
lapacke_zsytri2.c
|
||||
lapacke_zsytri2_work.c
|
||||
lapacke_zsytri_3.c
|
||||
lapacke_zsytri_3_work.c
|
||||
lapacke_zsytri2x.c
|
||||
lapacke_zsytri2x_work.c
|
||||
lapacke_zsytri_work.c
|
||||
lapacke_zsytrs.c
|
||||
lapacke_zsytrs_rook.c
|
||||
lapacke_zsytrs2.c
|
||||
lapacke_zsytrs2_work.c
|
||||
lapacke_zsytrs_work.c
|
||||
lapacke_zsytrs_rook_work.c
|
||||
lapacke_zsytrs_aa.c
|
||||
lapacke_zsytrs_aa_work.c
|
||||
lapacke_zsytrs_3.c
|
||||
lapacke_zsytrs_3_work.c
|
||||
lapacke_ztbcon.c
|
||||
lapacke_ztbcon_work.c
|
||||
lapacke_ztbrfs.c
|
||||
|
@ -1949,6 +2219,8 @@ set(ZSRC
|
|||
lapacke_zunbdb_work.c
|
||||
lapacke_zuncsd.c
|
||||
lapacke_zuncsd_work.c
|
||||
lapacke_zuncsd2by1.c
|
||||
lapacke_zuncsd2by1_work.c
|
||||
lapacke_zungbr.c
|
||||
lapacke_zungbr_work.c
|
||||
lapacke_zunghr.c
|
||||
|
@ -2119,6 +2391,6 @@ foreach (Utils_FILE ${Utils_SRC})
|
|||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
# CMake variable names are case-sensitive; the project defines OpenBLAS_VERSION.
Version: @OpenBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: -L${libdir} -lopenblas
|
||||
Cflags: -I${includedir}
|
|
@ -77,7 +77,7 @@ if (CYGWIN)
|
|||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
|
||||
if (SMP)
|
||||
set(EXTRALIB "${EXTRALIB} -lpthread")
|
||||
endif ()
|
||||
|
|
|
@ -4,7 +4,8 @@
|
|||
## This is triggered by system.cmake and runs before any of the code is built.
|
||||
## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files).
|
||||
## Next it runs f_check and appends some fortran information to the files.
|
||||
## Finally it runs getarch and getarch_2nd for even more environment information.
|
||||
## Then it runs getarch and getarch_2nd for even more environment information.
|
||||
## Finally it builds gen_config_h for use at build time to generate config.h.
|
||||
|
||||
# CMake vars set by this file:
|
||||
# CORE
|
||||
|
@ -71,9 +72,15 @@ if (MSVC)
|
|||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
endif()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
# disable WindowsStore strict CRT checks
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
|
||||
endif ()
|
||||
|
||||
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
||||
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
|
@ -81,6 +88,10 @@ try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
|||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
message(STATUS "Running getarch")
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
|
@ -96,6 +107,7 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
|
|||
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
||||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
|
@ -103,6 +115,11 @@ try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
|||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH2_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT)
|
||||
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
|
||||
|
@ -111,3 +128,21 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE
|
|||
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
|
||||
ParseGetArchVars(${GETARCH2_MAKE_OUT})
|
||||
|
||||
# Compile gen_config_h, the helper used at build time to generate config.h.
|
||||
# Scratch directory and output binary name for the gen_config_h try_compile.
set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build")
|
||||
set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
# Hand the project version string to gen_config_h.c as the VERSION macro.
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
|
||||
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
|
||||
|
||||
# The helper is not compiled when targeting WindowsStore.
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
|
||||
)
|
||||
|
||||
# Stop configuration and print the captured compiler output if the build failed.
if (NOT ${GEN_CONFIG_H_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
||||
endif ()
|
||||
endif ()
|
|
@ -22,7 +22,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER")
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
endif ()
|
||||
|
@ -312,6 +312,8 @@ endif ()
|
|||
|
||||
set(AWK awk)
|
||||
|
||||
set(SED sed)
|
||||
|
||||
set(REVISION "-r${OpenBLAS_VERSION}")
|
||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
||||
|
||||
|
|
22
common.h
22
common.h
|
@ -420,7 +420,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_arm64.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_ZARCH
|
||||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
|
@ -428,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
|
|||
typedef char* env_var_t;
|
||||
#define readenv(p, n) ((p)=getenv(n))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
|
||||
#ifdef _POSIX_MONOTONIC_CLOCK
|
||||
|
@ -552,8 +561,13 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(C_SUN)
|
||||
#if defined(__STDC_IEC_559_COMPLEX__)
|
||||
#define CREAL(X) creal(X)
|
||||
#define CIMAG(X) cimag(X)
|
||||
#else
|
||||
#define CREAL(X) (*((FLOAT *)&X + 0))
|
||||
#define CIMAG(X) (*((FLOAT *)&X + 1))
|
||||
#endif
|
||||
#else
|
||||
#ifdef OPENBLAS_COMPLEX_STRUCT
|
||||
#define CREAL(Z) ((Z).real)
|
||||
|
@ -645,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
|
|||
*address = 0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
static __inline int readenv_atoi(char *env) {
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
static __inline int readenv_atoi(char *env) {
|
||||
env_var_t p;
|
||||
|
@ -660,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
|
|||
return(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
|
||||
|
||||
|
|
|
@ -105,7 +105,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define PROLOGUE \
|
||||
.arm ;\
|
||||
.global REALNAME ;\
|
||||
.func REALNAME ;\
|
||||
REALNAME:
|
||||
|
||||
#define EPILOGUE
|
||||
|
|
|
@ -39,7 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...);
|
|||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
unsigned long *nodemask, unsigned long maxnode,
|
||||
unsigned flags) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) doesn't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
|
@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
|||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) doesn't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
|
|
|
@ -2193,7 +2193,7 @@
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sgemm_p;
|
||||
|
|
|
@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef COMMON_MIPS
|
||||
#define COMMON_MIPS
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
|
@ -42,11 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
}
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
|
||||
|
@ -80,7 +75,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define PROLOGUE \
|
||||
.arm ;\
|
||||
.global REALNAME ;\
|
||||
.func REALNAME ;\
|
||||
REALNAME:
|
||||
|
||||
#define EPILOGUE
|
||||
|
|
|
@ -71,35 +71,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef COMMON_MIPS64
|
||||
#define COMMON_MIPS64
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret, val = 1;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"1: ll %0, %3\n"
|
||||
" ori %2, %0, 1\n"
|
||||
" sc %2, %1\n"
|
||||
" beqz %2, 1b\n"
|
||||
" andi %2, %0, 1\n"
|
||||
" sync\n"
|
||||
: "=&r" (val), "=m" (address), "=&r" (ret)
|
||||
: "m" (address)
|
||||
: "memory");
|
||||
|
||||
} while (ret);
|
||||
}
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
|
||||
|
|
|
@ -245,6 +245,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
||||
#ifdef F_INTERFACE_PGI
|
||||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_ZARCH
|
||||
#define COMMON_ZARCH
|
||||
|
||||
/* Memory-barrier macros. Both are defined with an empty expansion in this
   header, so using MB or WMB emits no barrier instruction. */
#define MB
|
||||
/* NOTE(review): the commented-out statements below look like AArch64 barrier
   assembly carried over from another port; confirm whether this target needs
   a real barrier here. */
//__asm__ __volatile__ ("dmb ish" : : : "memory")
|
||||
#define WMB
|
||||
//__asm__ __volatile__ ("dmb ishst" : : : "memory")
|
||||
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/*
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"mov x4, #1 \n\t"
|
||||
"1: \n\t"
|
||||
"ldaxr x2, [%1] \n\t"
|
||||
"cbnz x2, 1b \n\t"
|
||||
"2: \n\t"
|
||||
"stxr w3, x4, [%1] \n\t"
|
||||
"cbnz w3, 1b \n\t"
|
||||
"mov %0, #0 \n\t"
|
||||
: "=r"(ret), "=r"(address)
|
||||
: "1"(address)
|
||||
: "memory", "x2" , "x3", "x4"
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
} while (ret);
|
||||
|
||||
}
|
||||
*/
|
||||
//#define BLAS_LOCK_DEFINED
|
||||
|
||||
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 256 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
REALNAME:
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#if defined(CORTEXA57)
|
||||
#define BUFFER_SIZE (20 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#endif
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
2
cpuid.h
2
cpuid.h
|
@ -114,6 +114,7 @@
|
|||
#define CORE_HASWELL 24
|
||||
#define CORE_STEAMROLLER 25
|
||||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
@ -209,5 +210,6 @@ typedef struct {
|
|||
#define CPUTYPE_HASWELL 48
|
||||
#define CPUTYPE_STEAMROLLER 49
|
||||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
|
||||
#endif
|
||||
|
|
|
@ -74,7 +74,7 @@ int get_feature(char *search)
|
|||
fclose(infile);
|
||||
|
||||
|
||||
if( p == NULL ) return;
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
|
|
|
@ -30,17 +30,26 @@
|
|||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
#define CPU_CORTEXA57 2
|
||||
#define CPU_VULCAN 3
|
||||
#define CPU_THUNDERX 4
|
||||
#define CPU_THUNDERX2T99 5
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"ARMV8" ,
|
||||
"CORTEXA57"
|
||||
"CORTEXA57",
|
||||
"VULCAN",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"unknown",
|
||||
"armv8" ,
|
||||
"cortexa57"
|
||||
"cortexa57",
|
||||
"vulcan",
|
||||
"thunderx",
|
||||
"thunderx2t99"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -85,25 +94,34 @@ int detect(void)
|
|||
#ifdef linux
|
||||
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
|
||||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("CPU part", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
|
||||
cpu_part = strchr(buffer, ':') + 2;
|
||||
cpu_part = strdup(cpu_part);
|
||||
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
|
||||
cpu_implementer = strchr(buffer, ':') + 2;
|
||||
cpu_implementer = strdup(cpu_implementer);
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "0xd07")) {
|
||||
if(cpu_part != NULL && cpu_implementer != NULL) {
|
||||
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41"))
|
||||
return CPU_CORTEXA57;
|
||||
}
|
||||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
|
||||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
||||
return CPU_THUNDERX2T99;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
@ -176,6 +194,28 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_VULCAN:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
printf("#define CORTEXA57\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
|
@ -194,6 +234,40 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 16777216\n");
|
||||
printf("#define L2_LINESIZE 128\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
123
cpuid_x86.c
123
cpuid_x86.c
|
@ -636,6 +636,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
LD1.associative = 8;
|
||||
LD1.linesize = 64;
|
||||
break;
|
||||
case 0x63 :
|
||||
DTB.size = 2048;
|
||||
DTB.associative = 4;
|
||||
DTB.linesize = 32;
|
||||
LDTB.size = 4096;
|
||||
LDTB.associative= 4;
|
||||
LDTB.linesize = 32;
|
||||
case 0x66 :
|
||||
LD1.size = 8;
|
||||
LD1.associative = 4;
|
||||
|
@ -667,6 +674,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
LC1.size = 64;
|
||||
LC1.associative = 8;
|
||||
break;
|
||||
case 0x76 :
|
||||
ITB.size = 2048;
|
||||
ITB.associative = 0;
|
||||
ITB.linesize = 8;
|
||||
LITB.size = 4096;
|
||||
LITB.associative= 0;
|
||||
LITB.linesize = 8;
|
||||
case 0x77 :
|
||||
LC1.size = 16;
|
||||
LC1.associative = 4;
|
||||
|
@ -1110,6 +1124,9 @@ int get_cpuname(void){
|
|||
break;
|
||||
case 3:
|
||||
switch (model) {
|
||||
case 7:
|
||||
// Bay Trail
|
||||
return CPUTYPE_ATOM;
|
||||
case 10:
|
||||
case 14:
|
||||
// Ivy Bridge
|
||||
|
@ -1199,6 +1216,33 @@ int get_cpuname(void){
|
|||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7:
|
||||
// Xeon Phi Knights Landing
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
// Apollo Lake
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 14: // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
|
@ -1235,8 +1279,11 @@ int get_cpuname(void){
|
|||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 7:
|
||||
case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 5:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 6:
|
||||
switch (model) {
|
||||
case 1:
|
||||
|
@ -1251,7 +1298,13 @@ int get_cpuname(void){
|
|||
return CPUTYPE_PILEDRIVER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 5: // New EXCAVATOR CPUS
|
||||
if(support_avx())
|
||||
return CPUTYPE_EXCAVATOR;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
case 8:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
|
@ -1273,8 +1326,19 @@ int get_cpuname(void){
|
|||
break;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1401,6 +1465,7 @@ static char *cpuname[] = {
|
|||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1454,6 +1519,7 @@ static char *lowercpuname[] = {
|
|||
"haswell",
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1484,6 +1550,7 @@ static char *corename[] = {
|
|||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -1514,6 +1581,7 @@ static char *corename_lower[] = {
|
|||
"haswell",
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
};
|
||||
|
||||
|
||||
|
@ -1710,8 +1778,33 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:
|
||||
// Phi Knights Landing
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 12:
|
||||
// Apollo Lake
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -1741,8 +1834,13 @@ int get_coretype(void){
|
|||
return CORE_PILEDRIVER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
|
||||
case 5: // New EXCAVATOR
|
||||
if(support_avx())
|
||||
return CORE_EXCAVATOR;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
case 8:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
|
@ -1764,9 +1862,22 @@ int get_coretype(void){
|
|||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
}else return CORE_BARCELONA;
|
||||
} else if (exfamily == 8) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
} else {
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
/**************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Type", buffer, 4)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
char *get_corename(void)
|
||||
{
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Print the architecture name "ZARCH" (no trailing newline). */
void get_architecture(void)
{
  fputs("ZARCH", stdout);
}
|
||||
|
||||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
/* Print the sub-directory name "zarch" (no trailing newline). */
void get_subdirname(void)
{
  fputs("zarch", stdout);
}
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d){
|
||||
case CPU_GENERIC:
|
||||
printf("#define ZARCH_GENERIC\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z13:
|
||||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
4
ctest.c
4
ctest.c
|
@ -105,6 +105,10 @@ ARCH_X86_64
|
|||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
#if defined(__s390x__) || defined(__zarch__)
|
||||
ARCH_ZARCH
|
||||
#endif
|
||||
|
||||
#ifdef __mips64
|
||||
ARCH_MIPS64
|
||||
#endif
|
||||
|
|
|
@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
|
|||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
|
|
@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
|||
#endif
|
||||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
blas_queue_t queue[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
|
||||
|
|
|
@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -316,7 +316,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (min_l > GEMM3M_Q) {
|
||||
min_l = (min_l + 1) / 2;
|
||||
#ifdef UNROLL_X
|
||||
min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
|
||||
min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -326,7 +326,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else {
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -365,7 +365,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -386,7 +386,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else {
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -429,7 +429,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -451,7 +451,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else {
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -494,7 +494,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -297,9 +297,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_l = GEMM_Q;
|
||||
} else {
|
||||
if (min_l > GEMM_Q) {
|
||||
min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
}
|
||||
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
|
||||
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
|
||||
}
|
||||
|
||||
|
@ -311,7 +311,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else {
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
} else {
|
||||
l1stride = 0;
|
||||
}
|
||||
|
@ -369,7 +369,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -365,7 +365,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
buffer[0] = sb;
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
|
||||
buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
|
||||
}
|
||||
|
||||
for(ls = 0; ls < k; ls += min_l){
|
||||
|
@ -384,7 +384,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else {
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -482,7 +482,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -618,7 +618,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -754,7 +754,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM3M_P;
|
||||
} else
|
||||
if (min_i > GEMM3M_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -189,7 +189,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
|
@ -230,7 +230,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
|
||||
|
@ -245,7 +245,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
if (m_start >= js) {
|
||||
|
@ -284,7 +284,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
|
||||
|
@ -322,7 +322,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
aa = sb + min_l * (is - js) * COMPSIZE;
|
||||
|
@ -353,7 +353,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
aa = sb + min_l * (m_start - js) * COMPSIZE;
|
||||
|
@ -383,7 +383,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
aa = sb + min_l * (is - js) * COMPSIZE;
|
||||
|
|
|
@ -198,7 +198,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
|
@ -239,7 +239,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
aa = sb + min_l * (is - js) * COMPSIZE;
|
||||
|
@ -303,7 +303,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -375,7 +375,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
if (is < js + min_j) {
|
||||
|
@ -460,7 +460,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -210,8 +210,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to);
|
||||
#endif
|
||||
|
||||
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
|
||||
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
buffer[0] = sb;
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
|
@ -233,7 +232,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else {
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -253,8 +252,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
STOP_RPCC(copy_A);
|
||||
|
||||
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
|
||||
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
|
||||
|
||||
|
@ -353,8 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
while (current >= 0) {
|
||||
#endif
|
||||
|
||||
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
|
||||
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
|
@ -412,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
@ -425,8 +422,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
do {
|
||||
|
||||
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
|
||||
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
|
@ -602,9 +598,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
double di = (double)i;
|
||||
|
||||
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
|
||||
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) );
|
||||
|
||||
if (num_cpu == 0) width = n - ((n - width) & ~mask);
|
||||
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) );
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
|
@ -644,7 +640,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
double di = (double)i;
|
||||
|
||||
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
|
||||
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
|
|
|
@ -310,7 +310,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
buffer[0] = sb;
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
|
||||
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
|
||||
}
|
||||
|
||||
|
||||
|
@ -331,7 +331,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else {
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
} else {
|
||||
if (args -> nthreads == 1) l1stride = 0;
|
||||
}
|
||||
|
@ -443,7 +443,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
|
||||
}
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
|
|||
|
||||
int mm, nn;
|
||||
|
||||
mm = (loop & ~(GEMM_UNROLL_MN - 1));
|
||||
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
nn = MIN(GEMM_UNROLL_MN, n - loop);
|
||||
|
||||
#ifndef LOWER
|
||||
|
|
|
@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)i;
|
||||
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
|
||||
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
|
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)(arg -> n - i);
|
||||
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
|
||||
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
|
|
|
@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
|||
|
||||
int mm, nn;
|
||||
|
||||
mm = (loop & ~(GEMM_UNROLL_MN - 1));
|
||||
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
nn = MIN(GEMM_UNROLL_MN, n - loop);
|
||||
|
||||
#ifndef LOWER
|
||||
|
|
|
@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
|
|||
|
||||
int mm, nn;
|
||||
|
||||
mm = (loop & ~(GEMM_UNROLL_MN - 1));
|
||||
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
nn = MIN(GEMM_UNROLL_MN, n - loop);
|
||||
|
||||
#ifndef LOWER
|
||||
|
|
|
@ -12,6 +12,8 @@ if (SMP)
|
|||
set(BLAS_SERVER blas_server_omp.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED BLAS_SERVER)
|
||||
|
|
|
@ -110,3 +110,74 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Split a level-1 operation of length `m` into at most `nthreads` chunks,
 * build one queue entry per chunk and run them all through exec_blas().
 *
 * Each chunk receives the current `a`/`b` pointers (advanced by the bytes
 * consumed by the preceding chunks) and its own `c` pointer, offset by
 * chunk_index * 2 * sizeof(double) bytes from the caller's `c`; presumably
 * that is where each worker leaves its partial result for the caller to
 * combine -- confirm against the callers.
 *
 * Always returns 0.
 */
int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
                                         void *a, BLASLONG lda,
                                         void *b, BLASLONG ldb,
                                         void *c, BLASLONG ldc, int (*function)(), int nthreads){

  blas_queue_t queue[MAX_CPU_NUMBER];
  blas_arg_t   args [MAX_CPU_NUMBER];

  BLASLONG slot, remaining, chunk, a_bytes, b_bytes;
  int used = 0;

  /* Left-shift count that turns an element stride into a byte offset. */
  int shift = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;

  mode |= BLAS_LEGACY;

  for (slot = 0; slot < nthreads; slot++) blas_queue_init(&queue[slot]);

  for (remaining = m; remaining > 0; used++) {

    blas_arg_t   *arg   = &args[used];
    blas_queue_t *entry = &queue[used];

    /* Ceiling division of what is left over the workers not yet assigned. */
    chunk = blas_quickdivide(remaining + nthreads - used - 1,
                             nthreads - used);

    remaining -= chunk;
    if (remaining < 0) chunk = chunk + remaining;

    a_bytes = chunk * lda;
    b_bytes = (mode & BLAS_TRANSB_T) ? chunk : chunk * ldb;

    a_bytes <<= shift;
    b_bytes <<= shift;

    arg -> m     = chunk;
    arg -> n     = n;
    arg -> k     = k;
    arg -> a     = (void *)a;
    arg -> b     = (void *)b;
    arg -> c     = (void *)((char *)c + used * sizeof(double)*2);
    arg -> lda   = lda;
    arg -> ldb   = ldb;
    arg -> ldc   = ldc;
    arg -> alpha = alpha;

    entry -> mode    = mode;
    entry -> routine = function;
    entry -> args    = arg;
    entry -> next    = &queue[used + 1];

    /* Step the operand pointers past the part handed to this chunk. */
    a = (void *)((BLASULONG)a + a_bytes);
    b = (void *)((BLASULONG)b + b_bytes);
  }

  if (used == 0) return 0;

  /* Terminate the list at the last entry actually filled in. */
  queue[used - 1].next = NULL;

  exec_blas(used, queue);

  return 0;
}
|
||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
@ -276,6 +276,9 @@ static void* blas_thread_server(void *arg){
|
|||
unsigned int last_tick;
|
||||
void *buffer, *sa, *sb;
|
||||
blas_queue_t *queue;
|
||||
|
||||
blas_queue_t *tscq;
|
||||
|
||||
#ifdef TIMING_DEBUG
|
||||
unsigned long start, stop;
|
||||
#endif
|
||||
|
@ -309,8 +312,11 @@ static void* blas_thread_server(void *arg){
|
|||
|
||||
last_tick = (unsigned int)rpcc();
|
||||
|
||||
while (!thread_status[cpu].queue) {
|
||||
pthread_mutex_lock (&thread_status[cpu].lock);
|
||||
tscq=thread_status[cpu].queue;
|
||||
pthread_mutex_unlock (&thread_status[cpu].lock);
|
||||
|
||||
while(!tscq) {
|
||||
YIELDING;
|
||||
|
||||
if ((unsigned int)rpcc() - last_tick > thread_timeout) {
|
||||
|
@ -333,6 +339,9 @@ static void* blas_thread_server(void *arg){
|
|||
|
||||
last_tick = (unsigned int)rpcc();
|
||||
}
|
||||
pthread_mutex_lock (&thread_status[cpu].lock);
|
||||
tscq=thread_status[cpu].queue;
|
||||
pthread_mutex_unlock (&thread_status[cpu].lock);
|
||||
|
||||
}
|
||||
|
||||
|
@ -351,7 +360,9 @@ static void* blas_thread_server(void *arg){
|
|||
if (queue) {
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||
|
||||
pthread_mutex_lock (&thread_status[cpu].lock);
|
||||
thread_status[cpu].queue = (blas_queue_t *)1;
|
||||
pthread_mutex_unlock (&thread_status[cpu].lock);
|
||||
|
||||
sa = queue -> sa;
|
||||
sb = queue -> sb;
|
||||
|
@ -433,7 +444,10 @@ static void* blas_thread_server(void *arg){
|
|||
// thread is marked as done and other threads use them
|
||||
WMB;
|
||||
|
||||
pthread_mutex_lock (&thread_status[cpu].lock);
|
||||
thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
|
||||
pthread_mutex_unlock (&thread_status[cpu].lock);
|
||||
|
||||
WMB;
|
||||
|
||||
}
|
||||
|
@ -613,6 +627,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
#endif
|
||||
BLASLONG i = 0;
|
||||
blas_queue_t *current = queue;
|
||||
blas_queue_t *tsiq,*tspq;
|
||||
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
|
||||
int node = get_node();
|
||||
int nodes = get_num_nodes();
|
||||
|
@ -660,15 +675,23 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
}
|
||||
}
|
||||
#else
|
||||
while(thread_status[i].queue) {
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
tsiq=thread_status[i].queue ;
|
||||
pthread_mutex_unlock (&thread_status[i].lock);
|
||||
while(tsiq) {
|
||||
i ++;
|
||||
if (i >= blas_num_threads - 1) i = 0;
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
tsiq=thread_status[i].queue ;
|
||||
pthread_mutex_unlock (&thread_status[i].lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
queue -> assigned = i;
|
||||
WMB;
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
thread_status[i].queue = queue;
|
||||
pthread_mutex_unlock (&thread_status[i].lock);
|
||||
WMB;
|
||||
|
||||
queue = queue -> next;
|
||||
|
@ -689,11 +712,15 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
|
||||
pos = current -> assigned;
|
||||
|
||||
if ((BLASULONG)thread_status[pos].queue > 1) {
|
||||
pthread_mutex_lock (&thread_status[pos].lock);
|
||||
tspq=thread_status[pos].queue;
|
||||
pthread_mutex_unlock (&thread_status[pos].lock);
|
||||
|
||||
if ((BLASULONG)tspq > 1) {
|
||||
pthread_mutex_lock (&thread_status[pos].lock);
|
||||
|
||||
if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
|
||||
|
||||
pthread_mutex_lock (&thread_status[pos].lock);
|
||||
|
||||
#ifdef MONITOR
|
||||
num_suspend ++;
|
||||
|
@ -703,8 +730,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
thread_status[pos].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal(&thread_status[pos].wakeup);
|
||||
}
|
||||
pthread_mutex_unlock(&thread_status[pos].lock);
|
||||
|
||||
}
|
||||
pthread_mutex_unlock(&thread_status[pos].lock);
|
||||
}
|
||||
|
||||
current = current -> next;
|
||||
|
@ -714,11 +742,22 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
}
|
||||
|
||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||
blas_queue_t * tsqq;
|
||||
|
||||
while ((num > 0) && queue) {
|
||||
|
||||
while(thread_status[queue -> assigned].queue) {
|
||||
pthread_mutex_lock(&thread_status[queue->assigned].lock);
|
||||
tsqq=thread_status[queue -> assigned].queue;
|
||||
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
|
||||
|
||||
|
||||
while(tsqq) {
|
||||
YIELDING;
|
||||
pthread_mutex_lock(&thread_status[queue->assigned].lock);
|
||||
tsqq=thread_status[queue -> assigned].queue;
|
||||
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
|
||||
|
||||
|
||||
};
|
||||
|
||||
queue = queue -> next;
|
||||
|
|
|
@ -444,7 +444,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
#endif
|
||||
}
|
||||
|
||||
blas_server_avail = 0;
|
||||
|
|
|
@ -70,8 +70,10 @@ extern gotoblas_t gotoblas_STEAMROLLER;
|
|||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
|
@ -81,6 +83,7 @@ extern gotoblas_t gotoblas_HASWELL;
|
|||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -232,6 +235,7 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 7) return &gotoblas_ATOM; //Bay Trail
|
||||
return NULL;
|
||||
case 4:
|
||||
//Intel Haswell
|
||||
|
@ -263,7 +267,6 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
//Intel Braswell / Avoton
|
||||
if (model == 12 || model == 13) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
|
@ -286,6 +289,30 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake
|
||||
if (model == 12) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
|
@ -331,7 +358,14 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0){
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0 || model == 8){
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
|
@ -358,8 +392,15 @@ static gotoblas_t *get_coretype(void){
|
|||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
@ -370,7 +411,6 @@ static gotoblas_t *get_coretype(void){
|
|||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -401,6 +441,7 @@ static char *corename[] = {
|
|||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
@ -427,6 +468,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
|
@ -439,7 +481,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 22; i++)
|
||||
for ( i=1 ; i <= 23; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
|
@ -457,6 +499,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
|
||||
switch (found)
|
||||
{
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
|
|
|
@ -354,6 +354,24 @@ static int numa_check(void) {
|
|||
return common -> num_nodes;
|
||||
}
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
/*
 * Fallback sched_getcpu() for C libraries that do not provide one.
 *
 * Returns the CPU number the calling process last ran on, taken from
 * field 39 ("processor") of /proc/self/stat, or -1 if the file cannot be
 * opened or parsed.
 *
 * Field 2 of that file is the executable name in parentheses and may
 * itself contain spaces or parentheses, so counting whitespace-separated
 * tokens from the start of the line can land on the wrong field.  Parsing
 * therefore restarts after the LAST ')' on the line, which closes field 2.
 */
int sched_getcpu(void)
{
  char line[2048];
  char *p;
  char *after_comm = NULL;
  int cpu;
  int skipped;
  FILE *fp = fopen("/proc/self/stat", "r");

  if (fp == NULL)
    return -1;

  if (fgets(line, sizeof(line), fp) == NULL) {
    fclose(fp);
    return -1;
  }
  fclose(fp);

  /* Locate the end of the "(comm)" field. */
  for (p = line; *p != '\0'; p++) {
    if (*p == ')') after_comm = p + 1;
  }
  if (after_comm == NULL)
    return -1;

  /* Field 3 follows the ')'; skip fields 3..38 (36 tokens). */
  p = after_comm;
  for (skipped = 0; skipped < 36; skipped++) {
    while (*p == ' ') p++;
    if (*p == '\0' || *p == '\n')
      return -1;                      /* line shorter than expected */
    while (*p != ' ' && *p != '\0' && *p != '\n') p++;
  }

  /* Field 39: processor. */
  if (sscanf(p, "%d", &cpu) != 1)
    return -1;

  return(cpu);
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static void numa_mapping(void) {
|
||||
|
||||
int node, cpu, core;
|
||||
|
@ -808,16 +826,54 @@ void gotoblas_affinity_init(void) {
|
|||
common -> shmid = pshmid;
|
||||
|
||||
if (common -> magic != SH_MAGIC) {
|
||||
cpu_set_t *cpusetp;
|
||||
int nums;
|
||||
int ret;
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Shared Memory Initialization.\n");
|
||||
#endif
|
||||
|
||||
//returns the number of processors which are currently online
|
||||
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;
|
||||
|
||||
nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
#if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 3)
|
||||
common->num_procs = nums;
|
||||
#elif __GLIBC_PREREQ(2, 7)
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
common->num_procs = nums;
|
||||
} else {
|
||||
size_t size;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0)
|
||||
common->num_procs = nums;
|
||||
else
|
||||
common->num_procs = CPU_COUNT_S(size,cpusetp);
|
||||
}
|
||||
CPU_FREE(cpusetp);
|
||||
#else
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
if (ret!=0) {
|
||||
common->num_procs = nums;
|
||||
} else {
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int i;
|
||||
int n = 0;
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
common->num_procs = n;
|
||||
}
|
||||
#else
|
||||
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
if(common -> num_procs > MAX_CPUS) {
|
||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
|
||||
fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
@ -923,7 +979,7 @@ void gotoblas_set_affinity2(int threads) {};
|
|||
|
||||
void gotoblas_affinity_reschedule(void) {};
|
||||
|
||||
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_ONLN); }
|
||||
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); }
|
||||
|
||||
int get_num_nodes(void) { return 1; }
|
||||
|
||||
|
|
|
@ -169,13 +169,50 @@ void goto_set_num_threads(int num_threads) {};
|
|||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
#endif
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
#endif
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
@ -184,7 +221,7 @@ int get_num_procs(void) {
|
|||
#ifdef OS_ANDROID
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
@ -381,6 +418,16 @@ static int release_pos = 0;
|
|||
static int hot_alloc = 0;
|
||||
#endif
|
||||
|
||||
/* Global lock for memory allocation */
|
||||
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
static pthread_spinlock_t alloc_lock = 0;
|
||||
#else
|
||||
static BLASULONG alloc_lock = 0UL;
|
||||
#endif
|
||||
|
||||
#ifdef ALLOC_MMAP
|
||||
|
||||
static void alloc_mmap_free(struct release_t *release){
|
||||
|
@ -390,6 +437,8 @@ static void alloc_mmap_free(struct release_t *release){
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef NO_WARMUP
|
||||
|
||||
static void *alloc_mmap(void *address){
|
||||
|
@ -406,9 +455,11 @@ static void *alloc_mmap(void *address){
|
|||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
|
@ -550,12 +601,14 @@ static void *alloc_mmap(void *address){
|
|||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
}
|
||||
#endif
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -889,15 +942,6 @@ static void *alloc_hugetlbfile(void *address){
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Global lock for memory allocation */
|
||||
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
static pthread_spinlock_t alloc_lock = 0;
|
||||
#else
|
||||
static BLASULONG alloc_lock = 0UL;
|
||||
#endif
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
static BLASULONG base_address = 0UL;
|
||||
|
@ -963,9 +1007,6 @@ void *blas_memory_alloc(int procpos){
|
|||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
|
||||
if (!memory_initialized) {
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
|
@ -991,17 +1032,16 @@ void *blas_memory_alloc(int procpos){
|
|||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
memory_initialized = 1;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
|
@ -1012,7 +1052,7 @@ void *blas_memory_alloc(int procpos){
|
|||
mypos = WhereAmI();
|
||||
|
||||
position = mypos;
|
||||
while (position > NUM_BUFFERS) position >>= 1;
|
||||
while (position >= NUM_BUFFERS) position >>= 1;
|
||||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
|
@ -1034,14 +1074,14 @@ void *blas_memory_alloc(int procpos){
|
|||
position = 0;
|
||||
|
||||
do {
|
||||
if (!memory[position].used) {
|
||||
/* if (!memory[position].used) { */
|
||||
|
||||
blas_lock(&memory[position].lock);
|
||||
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
/* } */
|
||||
|
||||
position ++;
|
||||
|
||||
|
@ -1103,7 +1143,9 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
memory[position].addr = map_address;
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
|
@ -1157,9 +1199,10 @@ void blas_memory_free(void *free_area){
|
|||
#endif
|
||||
|
||||
position = 0;
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
while ((memory[position].addr != free_area)
|
||||
&& (position < NUM_BUFFERS)) position++;
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
if (memory[position].addr != free_area) goto error;
|
||||
|
||||
|
@ -1171,6 +1214,7 @@ void blas_memory_free(void *free_area){
|
|||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
|
@ -1185,6 +1229,7 @@ void blas_memory_free(void *free_area){
|
|||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -1471,12 +1516,30 @@ static int on_process_term(void)
|
|||
#else
|
||||
#pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
#endif
|
||||
#pragma data_seg(push, old_seg)
|
||||
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg(".CRT$XLB")
|
||||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg()
|
||||
#else
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg(".CRT$XTU")
|
||||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg(pop, old_seg)
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg()
|
||||
#else
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
|
|
@ -167,7 +167,7 @@ int get_L2_size(void){
|
|||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
@ -251,7 +251,7 @@ int get_L2_size(void){
|
|||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
@ -497,13 +497,13 @@ void blas_set_parameter(void){
|
|||
if (xgemm_p == 0) xgemm_p = 64;
|
||||
#endif
|
||||
|
||||
sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1);
|
||||
dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1);
|
||||
cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1);
|
||||
zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1);
|
||||
sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
|
||||
dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
|
||||
cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
|
||||
zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
|
||||
#ifdef QUAD_PRECISION
|
||||
qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1);
|
||||
xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1);
|
||||
qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
|
||||
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
|
||||
#endif
|
||||
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
|
@ -727,3 +727,38 @@ void blas_set_parameter(void){
|
|||
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_ARM64)

#if defined(VULCAN) || defined(THUNDERX2T99)
/* DGEMM prefetch sizes for A, B and C; set by blas_set_parameter() below. */
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif

/*
 * ARM64 variant of blas_set_parameter().
 *
 * For VULCAN / THUNDERX2T99 builds it installs fixed GEMM P/Q/R values for
 * the four precisions and the DGEMM prefetch sizes; for every other ARM64
 * target it does nothing.
 */
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
  /* P values */
  sgemm_p = 128;
  dgemm_p = 160;
  cgemm_p = 128;
  zgemm_p = 128;

  /* Q values */
  sgemm_q = 352;
  dgemm_q = 128;
  cgemm_q = 224;
  zgemm_q = 112;

  /* R values: identical for all precisions */
  sgemm_r = 4096;
  dgemm_r = 4096;
  cgemm_r = 4096;
  zgemm_r = 4096;

  /* DGEMM prefetch sizes */
  dgemm_prefetch_size_a = 3584;
  dgemm_prefetch_size_b = 512;
  dgemm_prefetch_size_c = 128;
#endif
}

#endif
|
||||
|
|
|
@ -46,10 +46,16 @@
|
|||
#define printf _cprintf
|
||||
#endif
|
||||
|
||||
#ifdef INTERFACE64
|
||||
#define MSGFMT " ** On entry to %6s parameter number %2ld had an illegal value\n"
|
||||
#else
|
||||
#define MSGFMT " ** On entry to %6s parameter number %2d had an illegal value\n"
|
||||
#endif
|
||||
|
||||
#ifdef __ELF__
|
||||
int __xerbla(char *message, blasint *info, blasint length){
|
||||
|
||||
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
|
||||
printf(MSGFMT,
|
||||
message, *info);
|
||||
|
||||
return 0;
|
||||
|
@ -61,7 +67,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_
|
|||
|
||||
int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){
|
||||
|
||||
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
|
||||
printf(MSGFMT,
|
||||
message, *info);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -118,10 +118,16 @@ endif
|
|||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
INTERNALNAME = $(LIBPREFIX).so
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
|
||||
else
|
||||
|
@ -132,13 +138,13 @@ endif
|
|||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
#for LSB
|
||||
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
endif
|
||||
rm -f linktest
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
#!/bin/bash
#
# Reads symbol names from stdin, one per line, and prints every name that is
# expected in exports/gensymbol but does not occur there as a whole word.
#
#   * names ending in "_" are looked up with that trailing underscore removed
#     (caxpyc_, zaxpyc_ and blas_thread_shutdown_ are ignored);
#   * names starting with "cblas", "LAPACKE" or "lapack" are looked up as-is;
#   * all other names are ignored.

# Print "$1" unless grep finds it as a whole word in exports/gensymbol.
report_if_unlisted() {
	if ! grep -w "$1" exports/gensymbol >/dev/null
	then
		echo "$1"
	fi
}

while read sym; do

	case "$sym" in
	caxpyc_|zaxpyc_|blas_thread_shutdown_)
		# deliberately not checked
		continue
		;;
	*_)
		# trailing underscore: check the bare name
		report_if_unlisted "${sym%_}"
		;;
	cblas*|LAPACKE*|lapack*)
		report_if_unlisted "$sym"
		;;
	esac

done
|
||||
|
|
@ -1,11 +1,27 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Changelog
|
||||
# 2017/09/03 staticfloat
|
||||
# Added zsymv and csymv into @lapackobjs2 so they are properly renamed
|
||||
#
|
||||
# 2017/07/01 Saar
|
||||
# removed zsymv_ and csymv_ from @blasobjs, because these functions
|
||||
# are now in lapack-3.7.0
|
||||
# added blas_thread_shutdown_
|
||||
# added Cblas_cgemm3m and Cblas_zgemm3m
|
||||
# added somatcopy_, simatcopy_ ...
|
||||
# added new functions from lapack-3.7.0
|
||||
# added LAPACKE deprecated objs from lapack-3.7.0
|
||||
#
|
||||
# 2017/08/01 Saar
|
||||
# removed blas_thread_shutdown_
|
||||
#
|
||||
@blasobjs = (
|
||||
caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru,
|
||||
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,
|
||||
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,
|
||||
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
||||
ctrsv, csymv,
|
||||
ctrsv,
|
||||
damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
||||
dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv,
|
||||
dscal,dsdot,dspmv,dspr2,
|
||||
|
@ -21,10 +37,18 @@
|
|||
zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru,
|
||||
zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2,
|
||||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
|
||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||
xerbla,
|
||||
saxpby,daxpby,caxpby,zaxpby,
|
||||
sgeadd,dgeadd,cgeadd,zgeadd,
|
||||
somatcopy,
|
||||
simatcopy,
|
||||
domatcopy,
|
||||
dimatcopy,
|
||||
comatcopy,
|
||||
cimatcopy,
|
||||
zomatcopy,
|
||||
zimatcopy,
|
||||
);
|
||||
|
||||
@cblasobjs = (
|
||||
|
@ -80,6 +104,12 @@
|
|||
cgemm3m,zgemm3m
|
||||
);
|
||||
|
||||
@cblasgemm3mobjs = (
|
||||
cblas_cgemm3m,cblas_zgemm3m
|
||||
);
|
||||
|
||||
|
||||
|
||||
|
||||
#both underscore and no underscore
|
||||
@misc_common_objs = (
|
||||
|
@ -239,7 +269,7 @@
|
|||
spotrs,
|
||||
|
||||
# CLASRC -- Single precision complex LAPACK routines
|
||||
# already provided by @blasobjs: csymv
|
||||
# already provided by @blasobjs:
|
||||
# already provided by @lapackobjs:
|
||||
# cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri
|
||||
cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx,
|
||||
|
@ -286,6 +316,7 @@
|
|||
cspsvx, csptrf, csptri, csptrs, csrscl, cstedc,
|
||||
cstegr, cstein, csteqr,
|
||||
csycon,
|
||||
csymv,
|
||||
csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x,
|
||||
csyswapr, csytrs, csytrs2, csyconv,
|
||||
ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2,
|
||||
|
@ -379,7 +410,7 @@
|
|||
dtpqrt, dtpqrt2, dtpmqrt, dtprfb,
|
||||
|
||||
# ZLASRC -- Double precision complex LAPACK routines
|
||||
# already provided by @blasobjs: zsymv
|
||||
# already provided by @blasobjs:
|
||||
# already provided by @lapackobjs:
|
||||
# zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, zpotri,
|
||||
# ztrti2, ztrtri
|
||||
|
@ -430,6 +461,7 @@
|
|||
zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc,
|
||||
zstegr, zstein, zsteqr,
|
||||
zsycon,
|
||||
zsymv,
|
||||
zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x,
|
||||
zsyswapr, zsytrs, zsytrs2, zsyconv,
|
||||
ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2,
|
||||
|
@ -581,8 +613,208 @@
|
|||
zsbmv,
|
||||
zspr2,
|
||||
zsyr2,
|
||||
zunm22
|
||||
zunm22,
|
||||
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
sgemlqt,
|
||||
sgetsls,
|
||||
sgeqr,
|
||||
slatsqr,
|
||||
slamtsqr,
|
||||
sgemqr,
|
||||
sgelq,
|
||||
slaswlq,
|
||||
slamswlq,
|
||||
sgemlq,
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
dgemlqt,
|
||||
dgetsls,
|
||||
dgeqr,
|
||||
dlatsqr,
|
||||
dlamtsqr,
|
||||
dgemqr,
|
||||
dgelq,
|
||||
dlaswlq,
|
||||
dlamswlq,
|
||||
dgemlq,
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
cgemlqt,
|
||||
cgetsls,
|
||||
cgeqr,
|
||||
clatsqr,
|
||||
clamtsqr,
|
||||
cgemqr,
|
||||
cgelq,
|
||||
claswlq,
|
||||
clamswlq,
|
||||
cgemlq,
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
ztpmlqt,
|
||||
zgelqt,
|
||||
zgelqt3,
|
||||
zgemlqt,
|
||||
zgetsls,
|
||||
zgeqr,
|
||||
zlatsqr,
|
||||
zlamtsqr,
|
||||
zgemqr,
|
||||
zgelq,
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
|
@ -597,6 +829,34 @@
|
|||
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
|
||||
);
|
||||
|
||||
@lapacke_deprecated_objs = (
|
||||
LAPACKE_cggsvp,
|
||||
LAPACKE_cggsvp_work,
|
||||
LAPACKE_dggsvp,
|
||||
LAPACKE_dggsvp_work,
|
||||
LAPACKE_sggsvp,
|
||||
LAPACKE_sggsvp_work,
|
||||
LAPACKE_zggsvp,
|
||||
LAPACKE_zggsvp_work,
|
||||
LAPACKE_cggsvd,
|
||||
LAPACKE_cggsvd_work,
|
||||
LAPACKE_dggsvd,
|
||||
LAPACKE_dggsvd_work,
|
||||
LAPACKE_sggsvd,
|
||||
LAPACKE_sggsvd_work,
|
||||
LAPACKE_zggsvd,
|
||||
LAPACKE_zggsvd_work,
|
||||
LAPACKE_cgeqpf,
|
||||
LAPACKE_cgeqpf_work,
|
||||
LAPACKE_dgeqpf,
|
||||
LAPACKE_dgeqpf_work,
|
||||
LAPACKE_sgeqpf,
|
||||
LAPACKE_sgeqpf_work,
|
||||
LAPACKE_zgeqpf,
|
||||
LAPACKE_zgeqpf_work,
|
||||
);
|
||||
|
||||
|
||||
@lapackeobjs = (
|
||||
# LAPACK C interface routines.
|
||||
#
|
||||
|
@ -2948,11 +3208,191 @@
|
|||
LAPACKE_zsytrs_rook,
|
||||
LAPACKE_zsytrs_rook_work,
|
||||
LAPACKE_zuncsd2by1,
|
||||
LAPACKE_zuncsd2by1_work
|
||||
LAPACKE_zuncsd2by1_work,
|
||||
|
||||
## new functions from lapack-3.7.0
|
||||
|
||||
LAPACKE_cgemqr,
|
||||
LAPACKE_cgemqr_work,
|
||||
LAPACKE_cgetsls,
|
||||
LAPACKE_cgetsls_work,
|
||||
LAPACKE_chbev_2stage,
|
||||
LAPACKE_chbev_2stage_work,
|
||||
LAPACKE_chbevd_2stage,
|
||||
LAPACKE_chbevd_2stage_work,
|
||||
LAPACKE_chbevx_2stage,
|
||||
LAPACKE_chbevx_2stage_work,
|
||||
LAPACKE_checon_3,
|
||||
LAPACKE_checon_3_work,
|
||||
LAPACKE_cheev_2stage,
|
||||
LAPACKE_cheev_2stage_work,
|
||||
LAPACKE_cheevd_2stage,
|
||||
LAPACKE_cheevd_2stage_work,
|
||||
LAPACKE_cheevr_2stage,
|
||||
LAPACKE_cheevr_2stage_work,
|
||||
LAPACKE_cheevx_2stage,
|
||||
LAPACKE_cheevx_2stage_work,
|
||||
LAPACKE_chegv_2stage,
|
||||
LAPACKE_chegv_2stage_work,
|
||||
LAPACKE_chesv_aa,
|
||||
LAPACKE_chesv_aa_work,
|
||||
LAPACKE_chesv_rk,
|
||||
LAPACKE_chesv_rk_work,
|
||||
LAPACKE_chetrf_aa,
|
||||
LAPACKE_chetrf_aa_work,
|
||||
LAPACKE_chetrf_rk,
|
||||
LAPACKE_chetrf_rk_work,
|
||||
LAPACKE_chetri_3,
|
||||
LAPACKE_chetri_3_work,
|
||||
LAPACKE_chetrs_aa,
|
||||
LAPACKE_chetrs_aa_work,
|
||||
LAPACKE_chetrs_3,
|
||||
LAPACKE_chetrs_3_work,
|
||||
LAPACKE_csycon_3,
|
||||
LAPACKE_csycon_3_work,
|
||||
LAPACKE_csysv_aa,
|
||||
LAPACKE_csysv_aa_work,
|
||||
LAPACKE_csysv_rk,
|
||||
LAPACKE_csysv_rk_work,
|
||||
LAPACKE_csytrf_aa,
|
||||
LAPACKE_csytrf_aa_work,
|
||||
LAPACKE_csytrf_rk,
|
||||
LAPACKE_csytrf_rk_work,
|
||||
LAPACKE_csytri_3,
|
||||
LAPACKE_csytri_3_work,
|
||||
LAPACKE_csytrs_aa,
|
||||
LAPACKE_csytrs_aa_work,
|
||||
LAPACKE_csytrs_3,
|
||||
LAPACKE_csytrs_3_work,
|
||||
LAPACKE_dgemqr,
|
||||
LAPACKE_dgemqr_work,
|
||||
LAPACKE_dgetsls,
|
||||
LAPACKE_dgetsls_work,
|
||||
LAPACKE_dsbev_2stage,
|
||||
LAPACKE_dsbev_2stage_work,
|
||||
LAPACKE_dsbevd_2stage,
|
||||
LAPACKE_dsbevd_2stage_work,
|
||||
LAPACKE_dsbevx_2stage,
|
||||
LAPACKE_dsbevx_2stage_work,
|
||||
LAPACKE_dsycon_3,
|
||||
LAPACKE_dsycon_3_work,
|
||||
LAPACKE_dsyev_2stage,
|
||||
LAPACKE_dsyev_2stage_work,
|
||||
LAPACKE_dsyevd_2stage,
|
||||
LAPACKE_dsyevd_2stage_work,
|
||||
LAPACKE_dsyevr_2stage,
|
||||
LAPACKE_dsyevr_2stage_work,
|
||||
LAPACKE_dsyevx_2stage,
|
||||
LAPACKE_dsyevx_2stage_work,
|
||||
LAPACKE_dsygv_2stage,
|
||||
LAPACKE_dsygv_2stage_work,
|
||||
LAPACKE_dsysv_aa,
|
||||
LAPACKE_dsysv_aa_work,
|
||||
LAPACKE_dsysv_rk,
|
||||
LAPACKE_dsysv_rk_work,
|
||||
LAPACKE_dsytrf_aa,
|
||||
LAPACKE_dsytrf_aa_work,
|
||||
LAPACKE_dsytrf_rk,
|
||||
LAPACKE_dsytrf_rk_work,
|
||||
LAPACKE_dsytri_3,
|
||||
LAPACKE_dsytri_3_work,
|
||||
LAPACKE_dsytrs_aa,
|
||||
LAPACKE_dsytrs_aa_work,
|
||||
LAPACKE_dsytrs_3,
|
||||
LAPACKE_dsytrs_3_work,
|
||||
LAPACKE_sgemqr,
|
||||
LAPACKE_sgemqr_work,
|
||||
LAPACKE_sgetsls,
|
||||
LAPACKE_sgetsls_work,
|
||||
LAPACKE_ssbev_2stage,
|
||||
LAPACKE_ssbev_2stage_work,
|
||||
LAPACKE_ssbevd_2stage,
|
||||
LAPACKE_ssbevd_2stage_work,
|
||||
LAPACKE_ssbevx_2stage,
|
||||
LAPACKE_ssbevx_2stage_work,
|
||||
LAPACKE_ssycon_3,
|
||||
LAPACKE_ssycon_3_work,
|
||||
LAPACKE_ssyev_2stage,
|
||||
LAPACKE_ssyev_2stage_work,
|
||||
LAPACKE_ssyevd_2stage,
|
||||
LAPACKE_ssyevd_2stage_work,
|
||||
LAPACKE_ssyevr_2stage,
|
||||
LAPACKE_ssyevr_2stage_work,
|
||||
LAPACKE_ssyevx_2stage,
|
||||
LAPACKE_ssyevx_2stage_work,
|
||||
LAPACKE_ssygv_2stage,
|
||||
LAPACKE_ssygv_2stage_work,
|
||||
LAPACKE_ssysv_aa,
|
||||
LAPACKE_ssysv_aa_work,
|
||||
LAPACKE_ssysv_rk,
|
||||
LAPACKE_ssysv_rk_work,
|
||||
LAPACKE_ssytrf_aa,
|
||||
LAPACKE_ssytrf_aa_work,
|
||||
LAPACKE_ssytrf_rk,
|
||||
LAPACKE_ssytrf_rk_work,
|
||||
LAPACKE_ssytri_3,
|
||||
LAPACKE_ssytri_3_work,
|
||||
LAPACKE_ssytrs_aa,
|
||||
LAPACKE_ssytrs_aa_work,
|
||||
LAPACKE_ssytrs_3,
|
||||
LAPACKE_ssytrs_3_work,
|
||||
LAPACKE_zgemqr,
|
||||
LAPACKE_zgemqr_work,
|
||||
LAPACKE_zgetsls,
|
||||
LAPACKE_zgetsls_work,
|
||||
LAPACKE_zhbev_2stage,
|
||||
LAPACKE_zhbev_2stage_work,
|
||||
LAPACKE_zhbevd_2stage,
|
||||
LAPACKE_zhbevd_2stage_work,
|
||||
LAPACKE_zhbevx_2stage,
|
||||
LAPACKE_zhbevx_2stage_work,
|
||||
LAPACKE_zhecon_3,
|
||||
LAPACKE_zhecon_3_work,
|
||||
LAPACKE_zheev_2stage,
|
||||
LAPACKE_zheev_2stage_work,
|
||||
LAPACKE_zheevd_2stage,
|
||||
LAPACKE_zheevd_2stage_work,
|
||||
LAPACKE_zheevr_2stage,
|
||||
LAPACKE_zheevr_2stage_work,
|
||||
LAPACKE_zheevx_2stage,
|
||||
LAPACKE_zheevx_2stage_work,
|
||||
LAPACKE_zhegv_2stage,
|
||||
LAPACKE_zhegv_2stage_work,
|
||||
LAPACKE_zhesv_aa,
|
||||
LAPACKE_zhesv_aa_work,
|
||||
LAPACKE_zhesv_rk,
|
||||
LAPACKE_zhesv_rk_work,
|
||||
LAPACKE_zhetrf_aa,
|
||||
LAPACKE_zhetrf_aa_work,
|
||||
LAPACKE_zhetrf_rk,
|
||||
LAPACKE_zhetrf_rk_work,
|
||||
LAPACKE_zhetri_3,
|
||||
LAPACKE_zhetri_3_work,
|
||||
LAPACKE_zhetrs_aa,
|
||||
LAPACKE_zhetrs_aa_work,
|
||||
LAPACKE_zhetrs_3,
|
||||
LAPACKE_zhetrs_3_work,
|
||||
LAPACKE_zsycon_3,
|
||||
LAPACKE_zsycon_3_work,
|
||||
LAPACKE_zsysv_aa,
|
||||
LAPACKE_zsysv_aa_work,
|
||||
LAPACKE_zsysv_rk,
|
||||
LAPACKE_zsysv_rk_work,
|
||||
LAPACKE_zsytrf_aa,
|
||||
LAPACKE_zsytrf_aa_work,
|
||||
LAPACKE_zsytrf_rk,
|
||||
LAPACKE_zsytrf_rk_work,
|
||||
LAPACKE_zsytri_3,
|
||||
LAPACKE_zsytri_3_work,
|
||||
LAPACKE_zsytrs_aa,
|
||||
LAPACKE_zsytrs_aa_work,
|
||||
LAPACKE_zsytrs_3,
|
||||
LAPACKE_zsytrs_3_work,
|
||||
);
|
||||
|
||||
#These functions may need 2 underscores.
|
||||
@lapack_embeded_underscore_objs=(xerbla_array, chla_transtype, slasyf_rook,
|
||||
@lapack_embeded_underscore_objs=(
|
||||
xerbla_array, chla_transtype, slasyf_rook,
|
||||
ssytf2_rook, ssytrf_rook, ssytrs_rook,
|
||||
ssytri_rook, ssycon_rook, ssysv_rook,
|
||||
chetf2_rook, chetrf_rook, chetri_rook,
|
||||
|
@ -2968,11 +3408,9 @@
|
|||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
if ($ARGV[8] == 1) {
|
||||
#ONLY_CBLAS=1
|
||||
@underscore_objs = (@misc_underscore_objs);
|
||||
|
@ -2980,7 +3418,6 @@ if ($ARGV[8] == 1) {
|
|||
#NO_LAPACK=1
|
||||
@underscore_objs = (@blasobjs, @misc_underscore_objs);
|
||||
} elsif (-d "../lapack-netlib") {
|
||||
|
||||
if ($ARGV[7] == 0) {
|
||||
# NEED2UNDERSCORES=0
|
||||
# Don't need 2 underscores
|
||||
|
@ -2995,7 +3432,6 @@ if ($ARGV[8] == 1) {
|
|||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@underscore_objs = (@underscore_objs, @lapack_deprecated_objs);
|
||||
}
|
||||
|
||||
} else {
|
||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||
}
|
||||
|
@ -3006,16 +3442,16 @@ if ($ARGV[8] == 1) {
|
|||
@exblasobjs=();
|
||||
}
|
||||
|
||||
if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };
|
||||
if ($ARGV[3] == 1) {
|
||||
@underscore_objs = (@underscore_objs, @exblasobjs);
|
||||
};
|
||||
|
||||
if ($ARGV[1] eq "x86_64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
|
||||
if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
|
||||
if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
|
||||
if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
|
||||
|
||||
if ($ARGV[4] == 0) {
|
||||
@no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
|
||||
}else{
|
||||
|
@ -3025,9 +3461,14 @@ if ($ARGV[4] == 0) {
|
|||
if ($ARGV[6] == 1) {
|
||||
#NO_LAPACKE=1
|
||||
@no_underscore_objs = (@no_underscore_objs);
|
||||
} else {
|
||||
if ($ARGV[11] == 1) {
|
||||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@no_underscore_objs = (@no_underscore_objs, @lapackeobjs, @lapacke_deprecated_objs);
|
||||
} else {
|
||||
@no_underscore_objs = (@no_underscore_objs, @lapackeobjs);
|
||||
}
|
||||
}
|
||||
|
||||
@hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm);
|
||||
@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T);
|
||||
|
@ -3041,7 +3482,6 @@ $symbolprefix = $ARGV[9];
|
|||
$symbolsuffix = $ARGV[10];
|
||||
|
||||
if ($ARGV[0] eq "osx") {
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||
|
||||
|
@ -3053,16 +3493,13 @@ if ($ARGV[0] eq "osx"){
|
|||
print "_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n";
|
||||
}
|
||||
|
||||
# if ($ARGV[4] == 0) {
|
||||
foreach $objs (@no_underscore_objs) {
|
||||
print "_", $symbolprefix, $objs, $symbolsuffix, "\n";
|
||||
}
|
||||
# }
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if ($ARGV[0] eq "aix"){
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||
|
||||
|
@ -3074,16 +3511,13 @@ if ($ARGV[0] eq "aix"){
|
|||
print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n";
|
||||
}
|
||||
|
||||
# if ($ARGV[4] == 0) {
|
||||
foreach $objs (@no_underscore_objs) {
|
||||
print $symbolprefix, $objs, $symbolsuffix, "\n";
|
||||
}
|
||||
# }
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if ($ARGV[0] eq "objcopy") {
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||
|
||||
|
@ -3095,16 +3529,13 @@ if ($ARGV[0] eq "objcopy"){
|
|||
print $objs, $bu, $bu, " ", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n";
|
||||
}
|
||||
|
||||
# if ($ARGV[4] == 0) {
|
||||
foreach $objs (@no_underscore_objs) {
|
||||
print $objs, " ", $symbolprefix, $objs, $symbolsuffix, "\n";
|
||||
}
|
||||
# }
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if ($ARGV[0] eq "objconv") {
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||
|
||||
|
@ -3116,11 +3547,9 @@ if ($ARGV[0] eq "objconv"){
|
|||
print "-nr:_", $objs, $bu, $bu, ":_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n";
|
||||
}
|
||||
|
||||
# if ($ARGV[4] == 0) {
|
||||
foreach $objs (@no_underscore_objs) {
|
||||
print "-nr:_", $objs, ":_", $symbolprefix, $objs, $symbolsuffix, "\n";
|
||||
}
|
||||
# }
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
@ -3154,7 +3583,6 @@ if ($ARGV[0] eq "win2k"){
|
|||
|
||||
#for misc_common_objs
|
||||
foreach $objs (@misc_common_objs) {
|
||||
|
||||
$uppercase = $objs;
|
||||
$uppercase =~ tr/[a-z]/[A-Z]/;
|
||||
print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n";
|
||||
|
@ -3186,16 +3614,10 @@ if ($ARGV[0] eq "win2khpl"){
|
|||
$count ++;
|
||||
}
|
||||
|
||||
# foreach $objs (@hplobjs2) {
|
||||
# print "\t$objs=$objs"," \@", $count, "\n";
|
||||
# $count ++;
|
||||
# }
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if ($ARGV[0] eq "microsoft"){
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
|
||||
print "EXPORTS\n";
|
||||
|
@ -3228,7 +3650,6 @@ if ($ARGV[0] eq "microsoft"){
|
|||
}
|
||||
|
||||
if ($ARGV[0] eq "linktest") {
|
||||
|
||||
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||
|
||||
|
@ -3241,16 +3662,10 @@ if ($ARGV[0] eq "linktest"){
|
|||
print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "();\n";
|
||||
}
|
||||
|
||||
# if ($ARGV[4] == 0) {
|
||||
foreach $objs (@no_underscore_objs) {
|
||||
print $symbolprefix, $objs, $symbolsuffix, "();\n";
|
||||
}
|
||||
# }
|
||||
|
||||
|
||||
|
||||
|
||||
print "return 0;}\n";
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
|
22
f_check
22
f_check
|
@ -33,6 +33,7 @@ if ($compiler eq "") {
|
|||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77",
|
||||
"flang",
|
||||
"ifort");
|
||||
|
||||
OUTER:
|
||||
|
@ -77,10 +78,15 @@ if ($compiler eq "") {
|
|||
if ($major >= 4) {
|
||||
$vendor = GFORTRAN;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -197,6 +203,12 @@ if ($compiler eq "") {
|
|||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
|
@ -283,6 +295,12 @@ if ($link ne "") {
|
|||
$linker_L .= "-Wl,". $flags . " ";
|
||||
}
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($vendor eq "PGI") {
|
||||
|
@ -325,6 +343,10 @@ if ($vendor eq "INTEL"){
|
|||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
int main(int argc, char**argv) {
|
||||
FILE *fp;
|
||||
char line[100];
|
||||
char line2[80];
|
||||
char *s;
|
||||
int i;
|
||||
|
||||
fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n");
|
||||
fprintf(stdout,"#define OPENBLAS_CONFIG_H\n");
|
||||
fp=fopen(argv[1],"r");
|
||||
do{
|
||||
s=fgets(line,80,fp);
|
||||
if (s== NULL) break;
|
||||
memset(line2,0,80);
|
||||
i=sscanf(line,"#define %70c",line2);
|
||||
if (i!=0) {
|
||||
fprintf(stdout,"#define OPENBLAS_%s",line2);
|
||||
} else {
|
||||
fprintf(stdout,"\n");
|
||||
}
|
||||
} while (1);
|
||||
fclose(fp);
|
||||
fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION);
|
||||
fp=fopen(argv[2],"r");
|
||||
do{
|
||||
s=fgets(line,100,fp);
|
||||
if (s== NULL) break;
|
||||
fprintf(stdout,"%s",line);
|
||||
} while(1);
|
||||
fclose(fp);
|
||||
fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n");
|
||||
exit(0);
|
||||
}
|
82
getarch.c
82
getarch.c
|
@ -473,6 +473,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "EXCAVATOR"
|
||||
#endif
|
||||
|
||||
/*
 * Forced-target description used when FORCE_ZEN is defined.
 * Defining FORCE makes the later "#ifndef FORCE" detection section inactive
 * and makes main() report CORENAME instead of calling get_corename().
 * NOTE(review): FORCE_INTEL is set here although this is not an Intel
 * target; it is presumably consumed elsewhere in this file -- confirm.
 *
 * The L1 data cache line emits L1_DATA_ASSOCIATIVE, matching the layout of
 * the other forced-target blocks that describe L1 code and data caches.
 */
#if defined (FORCE_ZEN)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG   "-DZEN " \
		     "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
		     "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
		     "-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \
		     "-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \
		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
		     "-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
#define LIBNAME   "zen"
#define CORENAME  "ZEN"
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_SSE_GENERIC
|
||||
#define FORCE
|
||||
|
@ -884,7 +903,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef FORCE_CORTEXA57
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "ARMV8"
|
||||
#define SUBARCHITECTURE "CORTEXA57"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA57 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
|
@ -897,6 +916,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_VULCAN
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "VULCAN"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DVULCAN " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "vulcan"
|
||||
#define CORENAME "VULCAN"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTHUNDERX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
|
||||
#define LIBNAME "thunderx"
|
||||
#define CORENAME "THUNDERX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX2T99
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX2T99"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTHUNDERX2T99 " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "thunderx2t99"
|
||||
#define CORENAME "THUNDERX2T99"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
|
||||
|
@ -907,6 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#if defined(__zarch__) || defined(__s390x__)
|
||||
#define ZARCH
|
||||
#include "cpuid_zarch.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_AMD
|
||||
#include "cpuid_x86.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
|
@ -971,7 +1044,7 @@ static int get_num_cores(void) {
|
|||
|
||||
#if defined(linux) || defined(__sun__)
|
||||
//returns the number of processors which are currently online
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
return sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
#elif defined(OS_WINDOWS)
|
||||
|
||||
|
@ -1006,7 +1079,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
@ -1098,6 +1171,7 @@ int main(int argc, char *argv[]){
|
|||
p ++;
|
||||
}
|
||||
} else {
|
||||
if (*p != '\n')
|
||||
printf("%c", *p);
|
||||
p ++;
|
||||
}
|
||||
|
@ -1113,7 +1187,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -84,10 +84,10 @@ CBLAS1OBJS = \
|
|||
|
||||
CBLAS2OBJS = \
|
||||
cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
|
||||
ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) csymv.$(SUFFIX) \
|
||||
csyr.$(SUFFIX) csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
|
||||
csbmv.$(SUFFIX) cspmv.$(SUFFIX) \
|
||||
cspr.$(SUFFIX) cspr2.$(SUFFIX) \
|
||||
ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \
|
||||
csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
|
||||
csbmv.$(SUFFIX) \
|
||||
cspr2.$(SUFFIX) \
|
||||
ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \
|
||||
ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \
|
||||
chemv.$(SUFFIX) chbmv.$(SUFFIX) \
|
||||
|
@ -113,10 +113,10 @@ ZBLAS1OBJS = \
|
|||
|
||||
ZBLAS2OBJS = \
|
||||
zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
|
||||
ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) zsymv.$(SUFFIX) \
|
||||
zsyr.$(SUFFIX) zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
|
||||
zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \
|
||||
zspr.$(SUFFIX) zspr2.$(SUFFIX) \
|
||||
ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \
|
||||
zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
|
||||
zsbmv.$(SUFFIX) \
|
||||
zspr2.$(SUFFIX) \
|
||||
ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \
|
||||
ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \
|
||||
zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \
|
||||
|
@ -315,7 +315,7 @@ CCBLAS3OBJS = \
|
|||
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
||||
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
|
||||
cblas_cgeadd.$(SUFFIX)
|
||||
cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX)
|
||||
|
||||
|
||||
|
||||
|
@ -2137,3 +2137,5 @@ cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
|
|||
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
|
|
@ -42,9 +42,13 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
// benefits. Keep the multi-threading code for the record.
|
||||
#undef SMP
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
|
@ -81,7 +85,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
|
|||
if (incy < 0) y -= (n - 1) * incy;
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
#ifdef CBLAS
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * CBLAS error handler (compiled only when CBLAS is defined).
 *
 *   p     index of the offending parameter; 0 suppresses the
 *         "Parameter ... was incorrect" line.
 *   rout  name of the routine that detected the error.
 *   form  printf-style format for the detailed message, followed by its
 *         variadic arguments.
 *
 * Writes the message(s) to stderr and terminates the process with
 * exit(-1); this function does not return.
 */
void CNAME(blasint p, char *rout, char *form, ...)
{
  va_list args;

  va_start(args, form);

  if (p)
    /* Cast explicitly so the argument matches %d regardless of how wide
       blasint is in this build configuration. */
    fprintf(stderr, "Parameter %d to routine %s was incorrect\n", (int)p, rout);

  vfprintf(stderr, form, args);
  va_end(args);
  exit(-1);
}
|
||||
#endif
|
||||
|
|
@ -160,9 +160,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasin
|
|||
|
||||
if (n <= 0) {
|
||||
#ifdef FORCE_USE_STACK
|
||||
//*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
|
||||
CREAL(*result) = 0.0;
|
||||
CIMAG(*result) = 0.0;
|
||||
OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
|
||||
*result = zero;
|
||||
// CREAL(*result) = 0.0;
|
||||
// CIMAG(*result) = 0.0;
|
||||
return;
|
||||
#else
|
||||
return zero;
|
||||
|
|
|
@ -125,9 +125,8 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef NEW_IMATCOPY
|
||||
if (*lda == *ldb) {
|
||||
if (*lda == *ldb && *cols == *rows) {
|
||||
if ( order == BlasColMajor )
|
||||
{
|
||||
|
||||
|
@ -180,7 +179,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
{
|
||||
printf("Memory alloc failed\n");
|
||||
printf("Memory alloc failed in zimatcopy\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
@ -205,14 +204,14 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
|
@ -238,20 +237,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
free(b);
|
||||
return;
|
||||
|
||||
}
|
||||
|
|
|
@ -118,7 +118,7 @@ endforeach ()
|
|||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
|
||||
if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "HASWELL")
|
||||
# Turn USE_TRMM on for the architectures / targets / cores listed here.
# Every operand is a quoted "${VAR}" expansion: the zen test previously read "{CORE}"
# (no '$'), a literal string that can never equal "zen", so that branch was dead.
if ("${ARCH}" STREQUAL "arm" OR "${ARCH}" STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -32,10 +32,18 @@ ifeq ($(CORE), HASWELL)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV5
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
|
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
|
|||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
|
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
|
|||
CNRM2KERNEL = nrm2_vfp.S
|
||||
ZNRM2KERNEL = nrm2_vfp.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
|
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
|
|||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
|
||||
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
endif
|
||||
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
|
||||
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
endif
|
||||
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
|
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
|||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
|
|
|
@ -1,91 +1,12 @@
|
|||
|
||||
#################################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
ZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SAMINKERNEL = iamax_vfp.S
|
||||
DAMINKERNEL = iamax_vfp.S
|
||||
CAMINKERNEL = iamax_vfp.S
|
||||
ZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
SMAXKERNEL = iamax_vfp.S
|
||||
DMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SMINKERNEL = iamax_vfp.S
|
||||
DMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMAXKERNEL = iamax_vfp.S
|
||||
IDAMAXKERNEL = iamax_vfp.S
|
||||
ICAMAXKERNEL = iamax_vfp.S
|
||||
IZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMINKERNEL = iamax_vfp.S
|
||||
IDAMINKERNEL = iamax_vfp.S
|
||||
ICAMINKERNEL = iamax_vfp.S
|
||||
IZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISMAXKERNEL = iamax_vfp.S
|
||||
IDMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISMINKERNEL = iamax_vfp.S
|
||||
IDMINKERNEL = iamax_vfp.S
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
ZSWAPKERNEL = swap_vfp.S
|
||||
|
||||
SASUMKERNEL = asum_vfp.S
|
||||
DASUMKERNEL = asum_vfp.S
|
||||
CASUMKERNEL = asum_vfp.S
|
||||
ZASUMKERNEL = asum_vfp.S
|
||||
|
||||
SAXPYKERNEL = axpy_vfp.S
|
||||
DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
CDOTKERNEL = cdot_vfp.S
|
||||
ZDOTKERNEL = zdot_vfp.S
|
||||
include $(KERNELDIR)/KERNEL.ARMV6
|
||||
|
||||
SNRM2KERNEL = nrm2_vfpv3.S
|
||||
DNRM2KERNEL = nrm2_vfpv3.S
|
||||
CNRM2KERNEL = nrm2_vfpv3.S
|
||||
ZNRM2KERNEL = nrm2_vfpv3.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
DGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t_vfp.S
|
||||
DGEMVTKERNEL = gemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
|
||||
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
|
||||
|
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
|
|||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
|
||||
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
|
|
|
@ -475,6 +475,14 @@ asum_kernel_L999:
|
|||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -38,10 +38,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_X [fp, #0 ]
|
||||
#define OLD_INC_X [fp, #4 ]
|
||||
#define OLD_Y [fp, #8 ]
|
||||
#define OLD_INC_Y [fp, #12 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0]
|
||||
#define OLD_X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define OLD_Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#endif
|
||||
|
||||
#else //COMPLEX
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define OLD_Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#else
|
||||
#define OLD_ALPHAR [fp, #0]
|
||||
#define OLD_ALPHAI [fp, #8]
|
||||
#define OLD_X [fp, #16 ]
|
||||
#define OLD_INC_X [fp, #20 ]
|
||||
#define OLD_Y [fp, #24 ]
|
||||
#define OLD_INC_Y [fp, #28 ]
|
||||
#endif
|
||||
|
||||
#endif //!defined(COMPLEX)
|
||||
|
||||
#else //__ARM_PCS_VFP
|
||||
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
|
||||
#endif //!defined(__ARM_PCS_VFP)
|
||||
|
||||
#define N r0
|
||||
#define Y r1
|
||||
|
@ -64,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(DOUBLE)
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -83,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -363,6 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #8
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
#else //COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
|
|
@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
#define OLD_Y r3
|
||||
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
|
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
* registers
|
||||
*******************************************************/
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_RETURN_ADDR r0
|
||||
#define OLD_N r1
|
||||
#define OLD_X r2
|
||||
#define OLD_INC_X r3
|
||||
#define OLD_Y [fp, #0 ]
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#define RETURN_ADDR r8
|
||||
#else
|
||||
#define OLD_Y r3
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
#endif
|
||||
|
||||
#define I r5
|
||||
#define Y r6
|
||||
|
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
|
||||
push {r4 - r9, fp}
|
||||
add fp, sp, #24
|
||||
add fp, sp, #28
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
sub r4, fp, #128
|
||||
|
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmov s2, s0
|
||||
vmov s3, s0
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
mov RETURN_ADDR, OLD_RETURN_ADDR
|
||||
mov N, OLD_N
|
||||
mov X, OLD_X
|
||||
mov INC_X, OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#else
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble cdot_kernel_L999
|
||||
|
@ -265,7 +283,6 @@ cdot_kernel_S10:
|
|||
|
||||
|
||||
cdot_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { s8 - s15} // restore floating point registers
|
||||
|
||||
|
@ -276,8 +293,11 @@ cdot_kernel_L999:
|
|||
vadd.f32 s0 , s0, s2
|
||||
vsub.f32 s1 , s1, s3
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vstm RETURN_ADDR, {s0 - s1}
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
|
|
|
@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
|
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp N, #0
|
||||
ble cgemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
vstr s0 , ALPHA_R
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
|
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp OLD_N, #0
|
||||
ble cgemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
|
|
@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -246,6 +246,9 @@ ddot_kernel_L999:
|
|||
vldm r3, { d8 - d15} // restore floating point registers
|
||||
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
sub sp, fp, #24
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
|
|
@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue