Merge remote-tracking branch 'upstream/develop' into develop
This commit is contained in:
commit
d9346930dd
|
@ -5,6 +5,7 @@
|
||||||
*.def
|
*.def
|
||||||
*.o
|
*.o
|
||||||
*.out
|
*.out
|
||||||
|
*.tmp
|
||||||
lapack-3.1.1
|
lapack-3.1.1
|
||||||
lapack-3.1.1.tgz
|
lapack-3.1.1.tgz
|
||||||
lapack-3.4.1
|
lapack-3.4.1
|
||||||
|
|
144
.travis.yml
144
.travis.yml
|
@ -1,4 +1,119 @@
|
||||||
|
# XXX: Precise is already deprecated, new default is Trusty.
|
||||||
|
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
|
||||||
|
dist: precise
|
||||||
|
sudo: false
|
||||||
language: c
|
language: c
|
||||||
|
compiler: gcc
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
include:
|
||||||
|
- &test-ubuntu
|
||||||
|
stage: test
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
packages:
|
||||||
|
- gfortran
|
||||||
|
before_script: &common-before
|
||||||
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||||
|
script:
|
||||||
|
- set -e
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
|
- make -C test $COMMON_FLAGS $BTYPE
|
||||||
|
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||||
|
- make -C utest $COMMON_FLAGS $BTYPE
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64
|
||||||
|
- BTYPE="BINARY=64"
|
||||||
|
|
||||||
|
- <<: *test-ubuntu
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64
|
||||||
|
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||||
|
|
||||||
|
- <<: *test-ubuntu
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64
|
||||||
|
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||||
|
|
||||||
|
- <<: *test-ubuntu
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
packages:
|
||||||
|
- gcc-multilib
|
||||||
|
- gfortran-multilib
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX32
|
||||||
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
|
- stage: test
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
packages:
|
||||||
|
- binutils-mingw-w64-x86-64
|
||||||
|
- gcc-mingw-w64-x86-64
|
||||||
|
- gfortran-mingw-w64-x86-64
|
||||||
|
before_script: *common-before
|
||||||
|
script:
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=WIN64
|
||||||
|
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||||
|
|
||||||
|
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc.
|
||||||
|
# These jobs needs sudo, so Travis runs them on VM-based infrastructure
|
||||||
|
# which is slower than container-based infrastructure used for jobs
|
||||||
|
# that don't require sudo.
|
||||||
|
- &test-alpine
|
||||||
|
stage: test
|
||||||
|
dist: trusty
|
||||||
|
sudo: true
|
||||||
|
language: minimal
|
||||||
|
before_install:
|
||||||
|
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
|
||||||
|
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
|
||||||
|
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||||
|
install:
|
||||||
|
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||||
|
before_script: *common-before
|
||||||
|
script:
|
||||||
|
- set -e
|
||||||
|
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||||
|
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
|
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||||
|
- alpine make -C test $COMMON_FLAGS $BTYPE
|
||||||
|
- alpine make -C ctest $COMMON_FLAGS $BTYPE
|
||||||
|
- alpine make -C utest $COMMON_FLAGS $BTYPE
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64_MUSL
|
||||||
|
- BTYPE="BINARY=64"
|
||||||
|
|
||||||
|
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS,
|
||||||
|
# so it's "allowed to fail" for now (see allow_failures).
|
||||||
|
- &test-alpine-openmp
|
||||||
|
<<: *test-alpine
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64_MUSL
|
||||||
|
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||||
|
|
||||||
|
- <<: *test-alpine
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64_MUSL
|
||||||
|
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||||
|
|
||||||
|
# Build with the same flags as Alpine do in OpenBLAS package.
|
||||||
|
- <<: *test-alpine
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64_MUSL
|
||||||
|
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||||
|
|
||||||
|
allow_failures:
|
||||||
|
- <<: *test-alpine-openmp
|
||||||
|
|
||||||
|
# whitelist
|
||||||
|
branches:
|
||||||
|
only:
|
||||||
|
- master
|
||||||
|
- develop
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
webhooks:
|
webhooks:
|
||||||
|
@ -7,32 +122,3 @@ notifications:
|
||||||
on_success: change # options: [always|never|change] default: always
|
on_success: change # options: [always|never|change] default: always
|
||||||
on_failure: always # options: [always|never|change] default: always
|
on_failure: always # options: [always|never|change] default: always
|
||||||
on_start: never # options: [always|never|change] default: always
|
on_start: never # options: [always|never|change] default: always
|
||||||
|
|
||||||
compiler:
|
|
||||||
- gcc
|
|
||||||
|
|
||||||
env:
|
|
||||||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
|
|
||||||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
|
|
||||||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
|
|
||||||
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
|
|
||||||
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
|
||||||
|
|
||||||
before_install:
|
|
||||||
- sudo apt-get update -qq
|
|
||||||
- sudo apt-get install -qq gfortran
|
|
||||||
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
|
|
||||||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
|
||||||
|
|
||||||
script:
|
|
||||||
- set -e
|
|
||||||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
|
||||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
|
||||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
|
||||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
|
||||||
|
|
||||||
# whitelist
|
|
||||||
branches:
|
|
||||||
only:
|
|
||||||
- master
|
|
||||||
- develop
|
|
|
@ -231,43 +231,33 @@ install(TARGETS ${OpenBLAS_LIBNAME}
|
||||||
|
|
||||||
# Install include files
|
# Install include files
|
||||||
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
|
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
|
||||||
ADD_CUSTOM_COMMAND(
|
|
||||||
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
|
|
||||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
|
||||||
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
|
|
||||||
)
|
|
||||||
|
|
||||||
ADD_CUSTOM_TARGET(genconfig
|
execute_process(COMMAND ${GENCONFIG_BIN}
|
||||||
ALL
|
${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||||
DEPENDS openblas_config.h
|
${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h
|
||||||
)
|
OUTPUT_VARIABLE OPENBLAS_CONFIG_H_CONTENTS)
|
||||||
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
|
|
||||||
|
file(WRITE ${CMAKE_BINARY_DIR}/openblas_config.tmp "${OPENBLAS_CONFIG_H_CONTENTS}")
|
||||||
|
configure_file(${CMAKE_BINARY_DIR}/openblas_config.tmp ${CMAKE_BINARY_DIR}/openblas_config.h COPYONLY)
|
||||||
|
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
|
|
||||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||||
|
|
||||||
ADD_CUSTOM_TARGET(genf77blas
|
file(WRITE ${CMAKE_BINARY_DIR}/f77blas.h "")
|
||||||
ALL
|
file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n")
|
||||||
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
|
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS)
|
||||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "${COMMON_INTERFACE_H_CONTENTS}")
|
||||||
)
|
file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#endif")
|
||||||
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
|
|
||||||
|
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
|
|
||||||
if(NOT NO_CBLAS)
|
if(NOT NO_CBLAS)
|
||||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||||
|
|
||||||
ADD_CUSTOM_TARGET(gencblas
|
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||||
ALL
|
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||||
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
|
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
|
||||||
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
|
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
|
||||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
|
|
||||||
)
|
|
||||||
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
|
|
||||||
|
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(NOT NO_LAPACKE)
|
if(NOT NO_LAPACKE)
|
||||||
|
@ -277,7 +267,7 @@ if(NOT NO_LAPACKE)
|
||||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
|
|
||||||
ADD_CUSTOM_TARGET(genlapacke
|
ADD_CUSTOM_TARGET(genlapacke
|
||||||
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||||
)
|
)
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -87,7 +87,7 @@ endif ()
|
||||||
|
|
||||||
string(TOUPPER ${ARCH} UC_ARCH)
|
string(TOUPPER ${ARCH} UC_ARCH)
|
||||||
|
|
||||||
file(WRITE ${TARGET_CONF}
|
file(WRITE ${TARGET_CONF_TEMP}
|
||||||
"#define OS_${HOST_OS}\t1\n"
|
"#define OS_${HOST_OS}\t1\n"
|
||||||
"#define ARCH_${UC_ARCH}\t1\n"
|
"#define ARCH_${UC_ARCH}\t1\n"
|
||||||
"#define C_${COMPILER_ID}\t1\n"
|
"#define C_${COMPILER_ID}\t1\n"
|
||||||
|
@ -95,7 +95,7 @@ file(WRITE ${TARGET_CONF}
|
||||||
"#define FUNDERSCORE\t${FU}\n")
|
"#define FUNDERSCORE\t${FU}\n")
|
||||||
|
|
||||||
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
|
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
|
||||||
file(APPEND ${TARGET_CONF}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define OS_WINNT\t1\n")
|
"#define OS_WINNT\t1\n")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ if (NOT ONLY_CBLAS)
|
||||||
# TODO: set FEXTRALIB flags a la f_check?
|
# TODO: set FEXTRALIB flags a la f_check?
|
||||||
|
|
||||||
set(BU "_")
|
set(BU "_")
|
||||||
file(APPEND ${TARGET_CONF}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define BUNDERSCORE _\n"
|
"#define BUNDERSCORE _\n"
|
||||||
"#define NEEDBUNDERSCORE 1\n"
|
"#define NEEDBUNDERSCORE 1\n"
|
||||||
"#define NEED2UNDERSCORES 0\n")
|
"#define NEED2UNDERSCORES 0\n")
|
||||||
|
@ -56,7 +56,7 @@ else ()
|
||||||
set(NO_FBLAS 1)
|
set(NO_FBLAS 1)
|
||||||
#set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler
|
#set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler
|
||||||
set(BU "_")
|
set(BU "_")
|
||||||
file(APPEND ${TARGET_CONF}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define BUNDERSCORE _\n"
|
"#define BUNDERSCORE _\n"
|
||||||
"#define NEEDBUNDERSCORE 1\n")
|
"#define NEEDBUNDERSCORE 1\n")
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -2391,6 +2391,6 @@ foreach (Utils_FILE ${Utils_SRC})
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
|
||||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h")
|
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
|
||||||
include_directories(${lapacke_include_dir})
|
include_directories(${lapacke_include_dir})
|
||||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||||
|
|
|
@ -51,6 +51,7 @@ else()
|
||||||
set(TARGET_CONF "config.h")
|
set(TARGET_CONF "config.h")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp")
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
|
||||||
|
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
|
@ -79,10 +80,11 @@ endif ()
|
||||||
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
||||||
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
||||||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||||
|
configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY)
|
||||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||||
SOURCES ${GETARCH_SRC}
|
SOURCES ${GETARCH_SRC}
|
||||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR}
|
||||||
OUTPUT_VARIABLE GETARCH_LOG
|
OUTPUT_VARIABLE GETARCH_LOG
|
||||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||||
)
|
)
|
||||||
|
@ -100,16 +102,17 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE G
|
||||||
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
|
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
|
||||||
|
|
||||||
# append config data from getarch to the TARGET file and read in CMake vars
|
# append config data from getarch to the TARGET file and read in CMake vars
|
||||||
file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT})
|
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH_CONF_OUT})
|
||||||
ParseGetArchVars(${GETARCH_MAKE_OUT})
|
ParseGetArchVars(${GETARCH_MAKE_OUT})
|
||||||
|
|
||||||
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
||||||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||||
|
configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY)
|
||||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR}
|
||||||
OUTPUT_VARIABLE GETARCH2_LOG
|
OUTPUT_VARIABLE GETARCH2_LOG
|
||||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||||
)
|
)
|
||||||
|
@ -124,7 +127,8 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE
|
||||||
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
|
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
|
||||||
|
|
||||||
# append config data from getarch_2nd to the TARGET file and read in CMake vars
|
# append config data from getarch_2nd to the TARGET file and read in CMake vars
|
||||||
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
|
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT})
|
||||||
|
configure_file(${TARGET_CONF_TEMP} ${PROJECT_BINARY_DIR}/${TARGET_CONF} COPYONLY)
|
||||||
ParseGetArchVars(${GETARCH2_MAKE_OUT})
|
ParseGetArchVars(${GETARCH2_MAKE_OUT})
|
||||||
|
|
||||||
# compile get_config_h
|
# compile get_config_h
|
||||||
|
@ -144,4 +148,4 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
if (NOT ${GEN_CONFIG_H_RESULT})
|
if (NOT ${GEN_CONFIG_H_RESULT})
|
||||||
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -234,7 +234,9 @@ function(GenerateNamedObjects sources_in)
|
||||||
|
|
||||||
string(REPLACE ";" "\n#define " define_source "${obj_defines}")
|
string(REPLACE ";" "\n#define " define_source "${obj_defines}")
|
||||||
string(REPLACE "=" " " define_source "${define_source}")
|
string(REPLACE "=" " " define_source "${define_source}")
|
||||||
file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"")
|
file(WRITE ${new_source_file}.tmp "#define ${define_source}\n#include \"${old_source_file}\"")
|
||||||
|
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
|
||||||
|
file(REMOVE ${new_source_file}.tmp)
|
||||||
list(APPEND SRC_LIST_OUT ${new_source_file})
|
list(APPEND SRC_LIST_OUT ${new_source_file})
|
||||||
|
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
enable_language(Fortran)
|
enable_language(Fortran)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
# sources that need to be compiled twice, once with no flags and once with LOWER
|
# sources that need to be compiled twice, once with no flags and once with LOWER
|
||||||
set(UL_SOURCES
|
set(UL_SOURCES
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa
|
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
if (${CORE} STREQUAL "PPC440")
|
if (${CORE} STREQUAL "PPC440")
|
||||||
set(MEMORY memory_qalloc.c)
|
set(MEMORY memory_qalloc.c)
|
||||||
|
|
|
@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifdef DYNAMIC_ARCH
|
#ifdef DYNAMIC_ARCH
|
||||||
gotoblas_t *gotoblas = NULL;
|
gotoblas_t *gotoblas = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
|
|
||||||
#ifndef SMP
|
#ifndef SMP
|
||||||
|
@ -187,25 +186,24 @@ int i,n;
|
||||||
|
|
||||||
#if !defined(__GLIBC_PREREQ)
|
#if !defined(__GLIBC_PREREQ)
|
||||||
return nums;
|
return nums;
|
||||||
#endif
|
#else
|
||||||
#if !__GLIBC_PREREQ(2, 3)
|
#if !__GLIBC_PREREQ(2, 3)
|
||||||
return nums;
|
return nums;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !__GLIBC_PREREQ(2, 7)
|
#if !__GLIBC_PREREQ(2, 7)
|
||||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||||
if (ret!=0) return nums;
|
if (ret!=0) return nums;
|
||||||
n=0;
|
n=0;
|
||||||
#if !__GLIBC_PREREQ(2, 6)
|
#if !__GLIBC_PREREQ(2, 6)
|
||||||
for (i=0;i<nums;i++)
|
for (i=0;i<nums;i++)
|
||||||
if (CPU_ISSET(i,cpusetp)) n++;
|
if (CPU_ISSET(i,cpusetp)) n++;
|
||||||
nums=n;
|
nums=n;
|
||||||
#else
|
#else
|
||||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||||
#endif
|
#endif
|
||||||
return nums;
|
return nums;
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
cpusetp = CPU_ALLOC(nums);
|
cpusetp = CPU_ALLOC(nums);
|
||||||
if (cpusetp == NULL) return nums;
|
if (cpusetp == NULL) return nums;
|
||||||
size = CPU_ALLOC_SIZE(nums);
|
size = CPU_ALLOC_SIZE(nums);
|
||||||
|
@ -214,6 +212,8 @@ int i,n;
|
||||||
nums = CPU_COUNT_S(size,cpusetp);
|
nums = CPU_COUNT_S(size,cpusetp);
|
||||||
CPU_FREE(cpusetp);
|
CPU_FREE(cpusetp);
|
||||||
return nums;
|
return nums;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
|
|
||||||
set(BLAS1_SOURCES
|
set(BLAS1_SOURCES
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
|
||||||
|
|
||||||
# Makefile
|
# Makefile
|
||||||
|
|
|
@ -147,57 +147,57 @@ static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" fmov s6, "REG0" \n"
|
" fmov s6, "REG0" \n"
|
||||||
" fmov s7, "REG0" \n"
|
" fmov s7, "REG0" \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lasum_kernel_S_BEGIN \n"
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lasum_kernel_F_BEGIN: \n"
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
" asr "J", "N", #5 \n"
|
" asr "J", "N", #5 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lasum_kernel_F1 \n"
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
".Lasum_kernel_F32: \n"
|
"2: //asum_kernel_F32: \n"
|
||||||
" "KERNEL_F32" \n"
|
" "KERNEL_F32" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F32 \n"
|
" bne 2b //asum_kernel_F32 \n"
|
||||||
" "KERNEL_F32_FINALIZE" \n"
|
" "KERNEL_F32_FINALIZE" \n"
|
||||||
|
|
||||||
".Lasum_kernel_F1: \n"
|
"3: //asum_kernel_F1: \n"
|
||||||
" ands "J", "N", #31 \n"
|
" ands "J", "N", #31 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_F10: \n"
|
"4: //asum_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F10 \n"
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
" b .Lasum_kernel_L999 \n"
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S_BEGIN: \n"
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
" "INIT_S" \n"
|
" "INIT_S" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lasum_kernel_S1 \n"
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S4: \n"
|
"6: //asum_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S4 \n"
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S1: \n"
|
"7: //asum_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S10: \n"
|
"8: //asum_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S10 \n"
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
".Lasum_kernel_L999: \n"
|
"9: //asum_kernel_L999: \n"
|
||||||
" fmov %[ASUM_], "SUMFD" \n"
|
" fmov %[ASUM_], "SUMFD" \n"
|
||||||
|
|
||||||
: [ASUM_] "=r" (asum) //%0
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
|
|
@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_
|
||||||
" mov "Y", %[Y_] \n"
|
" mov "Y", %[Y_] \n"
|
||||||
" mov "INC_Y", %[INCY_] \n"
|
" mov "INC_Y", %[INCY_] \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lcopy_kernel_L999 \n"
|
" ble 8f //copy_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lcopy_kernel_S_BEGIN \n"
|
" bne 4f //copy_kernel_S_BEGIN \n"
|
||||||
" cmp "INC_Y", #1 \n"
|
" cmp "INC_Y", #1 \n"
|
||||||
" bne .Lcopy_kernel_S_BEGIN \n"
|
" bne 4f //copy_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lcopy_kernel_F_BEGIN: \n"
|
"// .Lcopy_kernel_F_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lcopy_kernel_F1 \n"
|
" beq 2f //copy_kernel_F1 \n"
|
||||||
" .align 5 \n"
|
" .align 5 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_F: \n"
|
"1: //copy_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lcopy_kernel_F \n"
|
" bne 1b //copy_kernel_F \n"
|
||||||
|
|
||||||
".Lcopy_kernel_F1: \n"
|
"2: //copy_kernel_F1: \n"
|
||||||
#if defined(COMPLEX) && defined(DOUBLE)
|
#if defined(COMPLEX) && defined(DOUBLE)
|
||||||
" b .Lcopy_kernel_L999 \n"
|
" b 8f //copy_kernel_L999 \n"
|
||||||
#else
|
#else
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Lcopy_kernel_L999 \n"
|
" ble 8f //copy_kernel_L999 \n"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
".Lcopy_kernel_F10: \n"
|
"3: //copy_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lcopy_kernel_F10 \n"
|
" bne 3b //copy_kernel_F10 \n"
|
||||||
" b .Lcopy_kernel_L999 \n"
|
" b 8f //copy_kernel_L999 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_S_BEGIN: \n"
|
"4: //copy_kernel_S_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lcopy_kernel_S1 \n"
|
" ble 6f //copy_kernel_S1 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_S4: \n"
|
"5: //copy_kernel_S4: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lcopy_kernel_S4 \n"
|
" bne 5b //copy_kernel_S4 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_S1: \n"
|
"6: //copy_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lcopy_kernel_L999 \n"
|
" ble 8f //copy_kernel_L999 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_S10: \n"
|
"7: //copy_kernel_S10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lcopy_kernel_S10 \n"
|
" bne 7b //copy_kernel_S10 \n"
|
||||||
|
|
||||||
".Lcopy_kernel_L999: \n"
|
"8: //copy_kernel_L999: \n"
|
||||||
|
|
||||||
:
|
:
|
||||||
: [N_] "r" (n), //%1
|
: [N_] "r" (n), //%1
|
||||||
|
|
|
@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" fmov d6, "REG0" \n"
|
" fmov d6, "REG0" \n"
|
||||||
" fmov d7, "REG0" \n"
|
" fmov d7, "REG0" \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lasum_kernel_S_BEGIN \n"
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lasum_kernel_F_BEGIN: \n"
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
" asr "J", "N", #5 \n"
|
" asr "J", "N", #5 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lasum_kernel_F1 \n"
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
".align 5 \n"
|
".align 5 \n"
|
||||||
".Lasum_kernel_F32: \n"
|
"2: //asum_kernel_F32: \n"
|
||||||
" "KERNEL_F32" \n"
|
" "KERNEL_F32" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F32 \n"
|
" bne 2b //asum_kernel_F32 \n"
|
||||||
" "KERNEL_F32_FINALIZE" \n"
|
" "KERNEL_F32_FINALIZE" \n"
|
||||||
|
|
||||||
".Lasum_kernel_F1: \n"
|
"3: //asum_kernel_F1: \n"
|
||||||
" ands "J", "N", #31 \n"
|
" ands "J", "N", #31 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_F10: \n"
|
"4: //asum_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F10 \n"
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
" b .Lasum_kernel_L999 \n"
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S_BEGIN: \n"
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
" "INIT_S" \n"
|
" "INIT_S" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lasum_kernel_S1 \n"
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S4: \n"
|
"6: //asum_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S4 \n"
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S1: \n"
|
"7: //asum_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S10: \n"
|
"8: //asum_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S10 \n"
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
".Lasum_kernel_L999: \n"
|
"9: //asum_kernel_L999: \n"
|
||||||
" fmov %[ASUM_], "SUMF" \n"
|
" fmov %[ASUM_], "SUMF" \n"
|
||||||
|
|
||||||
: [ASUM_] "=r" (asum) //%0
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
|
|
@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B
|
||||||
" fmov d6, xzr \n"
|
" fmov d6, xzr \n"
|
||||||
" fmov d7, xzr \n"
|
" fmov d7, xzr \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Ldot_kernel_S_BEGIN \n"
|
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||||
" cmp "INC_Y", #1 \n"
|
" cmp "INC_Y", #1 \n"
|
||||||
" bne .Ldot_kernel_S_BEGIN \n"
|
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Ldot_kernel_F_BEGIN: \n"
|
"1: //dot_kernel_F_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Ldot_kernel_F1 \n"
|
" beq 3f //dot_kernel_F1 \n"
|
||||||
|
|
||||||
" .align 5 \n"
|
" .align 5 \n"
|
||||||
".Ldot_kernel_F: \n"
|
"2: //dot_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_F \n"
|
" bne 2b //dot_kernel_F \n"
|
||||||
" "KERNEL_F_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
|
|
||||||
".Ldot_kernel_F1: \n"
|
"3: //dot_kernel_F1: \n"
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_F10: \n"
|
"4: //dot_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_F10 \n"
|
" bne 4b //dot_kernel_F10 \n"
|
||||||
" b .Ldot_kernel_L999 \n"
|
" b 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S_BEGIN: \n"
|
"5: //dot_kernel_S_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Ldot_kernel_S1 \n"
|
" ble 7f //dot_kernel_S1 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S4: \n"
|
"6: //dot_kernel_S4: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_S4 \n"
|
" bne 6b //dot_kernel_S4 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S1: \n"
|
"7: //dot_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S10: \n"
|
"8: //dot_kernel_S10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_S10 \n"
|
" bne 8b //dot_kernel_S10 \n"
|
||||||
|
|
||||||
".Ldot_kernel_L999: \n"
|
"9: //dot_kernel_L999: \n"
|
||||||
" str "DOTF", [%[DOT_]] \n"
|
" str "DOTF", [%[DOT_]] \n"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
|
|
@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
" fmov "SCALE", xzr \n"
|
" fmov "SCALE", xzr \n"
|
||||||
" fmov "SSQ", #1.0 \n"
|
" fmov "SSQ", #1.0 \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_BEGIN: \n"
|
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||||
" fmov "REGZERO", xzr \n"
|
" fmov "REGZERO", xzr \n"
|
||||||
" fmov "REGONE", #1.0 \n"
|
" fmov "REGONE", #1.0 \n"
|
||||||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||||
" mov "J", "N" \n"
|
" mov "J", "N" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lnrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_ZERO_SKIP: \n"
|
"2: //nrm2_kernel_F_ZERO_SKIP: \n"
|
||||||
" ldr d4, ["X"] \n"
|
" ldr d4, ["X"] \n"
|
||||||
" fcmp d4, "REGZERO" \n"
|
" fcmp d4, "REGZERO" \n"
|
||||||
" bne .Lnrm2_kernel_F_INIT \n"
|
" bne 3f //nrm2_kernel_F_INIT \n"
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
" ldr d4, ["X", #8] \n"
|
" ldr d4, ["X", #8] \n"
|
||||||
" fcmp d4, "REGZERO" \n"
|
" fcmp d4, "REGZERO" \n"
|
||||||
" bne .Lnrm2_kernel_F_INIT_I \n"
|
" bne 4f //nrm2_kernel_F_INIT_I \n"
|
||||||
#endif
|
#endif
|
||||||
" add "X", "X", "INC_X" \n"
|
" add "X", "X", "INC_X" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" beq .Lnrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
" b .Lnrm2_kernel_F_ZERO_SKIP \n"
|
" b 2b //nrm2_kernel_F_ZERO_SKIP \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_INIT: \n"
|
"3: //nrm2_kernel_F_INIT: \n"
|
||||||
" ldr d4, ["X"] \n"
|
" ldr d4, ["X"] \n"
|
||||||
" fabs d4, d4 \n"
|
" fabs d4, d4 \n"
|
||||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||||
|
@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
" fadd "SSQ", "SSQ", d4 \n"
|
" fadd "SSQ", "SSQ", d4 \n"
|
||||||
" fmov "SCALE", "CUR_MAX" \n"
|
" fmov "SCALE", "CUR_MAX" \n"
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
".Lnrm2_kernel_F_INIT_I: \n"
|
"4: //nrm2_kernel_F_INIT_I: \n"
|
||||||
" ldr d3, ["X", #8] \n"
|
" ldr d3, ["X", #8] \n"
|
||||||
" fabs d3, d3 \n"
|
" fabs d3, d3 \n"
|
||||||
" fmax "CUR_MAX", "SCALE", d3 \n"
|
" fmax "CUR_MAX", "SCALE", d3 \n"
|
||||||
|
@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
#endif
|
#endif
|
||||||
" add "X", "X", "INC_X" \n"
|
" add "X", "X", "INC_X" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" beq .Lnrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_START: \n"
|
"5: //nrm2_kernel_F_START: \n"
|
||||||
" cmp "INC_X", #"SZ" \n"
|
" cmp "INC_X", #"SZ" \n"
|
||||||
" bne .Lnrm2_kernel_F1 \n"
|
" bne 8f //nrm2_kernel_F1 \n"
|
||||||
" asr "K", "J", #4 \n"
|
" asr "K", "J", #4 \n"
|
||||||
" cmp "K", xzr \n"
|
" cmp "K", xzr \n"
|
||||||
" beq .Lnrm2_kernel_F1 \n"
|
" beq 8f //nrm2_kernel_F1 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F: \n"
|
"6: //nrm2_kernel_F: \n"
|
||||||
" ldp q16, q17, ["X"] \n"
|
" ldp q16, q17, ["X"] \n"
|
||||||
" ldp q18, q19, ["X", #32] \n"
|
" ldp q18, q19, ["X", #32] \n"
|
||||||
" ldp q20, q21, ["X", #64] \n"
|
" ldp q20, q21, ["X", #64] \n"
|
||||||
|
@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
" fmov "SCALE", "CUR_MAX" \n"
|
" fmov "SCALE", "CUR_MAX" \n"
|
||||||
#endif
|
#endif
|
||||||
" subs "K", "K", #1 \n"
|
" subs "K", "K", #1 \n"
|
||||||
" bne .Lnrm2_kernel_F \n"
|
" bne 6b //nrm2_kernel_F \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_DONE: \n"
|
"7: //nrm2_kernel_F_DONE: \n"
|
||||||
" ands "J", "J", #15 \n"
|
" ands "J", "J", #15 \n"
|
||||||
" beq .Lnrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F1: \n"
|
"8: //nrm2_kernel_F1: \n"
|
||||||
" ldr d4, ["X"] \n"
|
" ldr d4, ["X"] \n"
|
||||||
" fabs d4, d4 \n"
|
" fabs d4, d4 \n"
|
||||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||||
|
@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
#endif
|
#endif
|
||||||
" add "X", "X", "INC_X" \n"
|
" add "X", "X", "INC_X" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lnrm2_kernel_F1 \n"
|
" bne 8b //nrm2_kernel_F1 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_L999: \n"
|
"9: //nrm2_kernel_L999: \n"
|
||||||
" str "SSQ", [%[SSQ_]] \n"
|
" str "SSQ", [%[SSQ_]] \n"
|
||||||
" str "SCALE", [%[SCALE_]] \n"
|
" str "SCALE", [%[SCALE_]] \n"
|
||||||
|
|
||||||
|
|
|
@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
{
|
{
|
||||||
BLASLONG index = 0;
|
BLASLONG index = 0;
|
||||||
|
|
||||||
|
@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" mov "INC_X", %[INCX_] \n"
|
" mov "INC_X", %[INCX_] \n"
|
||||||
|
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Liamax_kernel_zero \n"
|
" ble 10f //iamax_kernel_zero \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Liamax_kernel_zero \n"
|
" ble 10f //iamax_kernel_zero \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Liamax_kernel_S_BEGIN \n"
|
" bne 5f //iamax_kernel_S_BEGIN \n"
|
||||||
" mov x7, "X" \n"
|
" mov x7, "X" \n"
|
||||||
|
|
||||||
".Liamax_kernel_F_BEGIN: \n"
|
"1: //iamax_kernel_F_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" subs "N", "N", #1 \n"
|
" subs "N", "N", #1 \n"
|
||||||
" ble .Liamax_kernel_L999 \n"
|
" ble 9f //iamax_kernel_L999 \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Liamax_kernel_F1 \n"
|
" beq 3f //iamax_kernel_F1 \n"
|
||||||
" add "Z", "Z", #1 \n"
|
" add "Z", "Z", #1 \n"
|
||||||
|
|
||||||
".Liamax_kernel_F: \n"
|
"2: //iamax_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Liamax_kernel_F \n"
|
" bne 2b //iamax_kernel_F \n"
|
||||||
" "KERNEL_F_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
" sub "Z", "Z", #1 \n"
|
" sub "Z", "Z", #1 \n"
|
||||||
|
|
||||||
".Liamax_kernel_F1: \n"
|
"3: //iamax_kernel_F1: \n"
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Liamax_kernel_L999 \n"
|
" ble 9f //iamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Liamax_kernel_F10: \n"
|
"4: //iamax_kernel_F10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Liamax_kernel_F10 \n"
|
" bne 4b //iamax_kernel_F10 \n"
|
||||||
" b .Liamax_kernel_L999 \n"
|
" b 9f //iamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Liamax_kernel_S_BEGIN: \n"
|
"5: //iamax_kernel_S_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" subs "N", "N", #1 \n"
|
" subs "N", "N", #1 \n"
|
||||||
" ble .Liamax_kernel_L999 \n"
|
" ble 9f //iamax_kernel_L999 \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Liamax_kernel_S1 \n"
|
" ble 7f //iamax_kernel_S1 \n"
|
||||||
|
|
||||||
".Liamax_kernel_S4: \n"
|
"6: //iamax_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Liamax_kernel_S4 \n"
|
" bne 6b //iamax_kernel_S4 \n"
|
||||||
|
|
||||||
".Liamax_kernel_S1: \n"
|
"7: //iamax_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Liamax_kernel_L999 \n"
|
" ble 9f //iamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Liamax_kernel_S10: \n"
|
"8: //iamax_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Liamax_kernel_S10 \n"
|
" bne 8b //iamax_kernel_S10 \n"
|
||||||
|
|
||||||
".Liamax_kernel_L999: \n"
|
"9: //iamax_kernel_L999: \n"
|
||||||
" mov x0, "INDEX" \n"
|
" mov x0, "INDEX" \n"
|
||||||
" b .Liamax_kernel_DONE \n"
|
" b 11f //iamax_kernel_DONE \n"
|
||||||
|
|
||||||
".Liamax_kernel_zero: \n"
|
"10: //iamax_kernel_zero: \n"
|
||||||
" mov x0, xzr \n"
|
" mov x0, xzr \n"
|
||||||
|
|
||||||
".Liamax_kernel_DONE: \n"
|
"11: //iamax_kernel_DONE: \n"
|
||||||
" mov %[INDEX_], "INDEX" \n"
|
" mov %[INDEX_], "INDEX" \n"
|
||||||
|
|
||||||
: [INDEX_] "=r" (index) //%0
|
: [INDEX_] "=r" (index) //%0
|
||||||
|
|
|
@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" mov "INC_X", %[INCX_] \n"
|
" mov "INC_X", %[INCX_] \n"
|
||||||
|
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lizamax_kernel_zero \n"
|
" ble 10f //izamax_kernel_zero \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lizamax_kernel_zero \n"
|
" ble 10f //izamax_kernel_zero \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lizamax_kernel_S_BEGIN \n"
|
" bne 5f //izamax_kernel_S_BEGIN \n"
|
||||||
" mov x7, "X" \n"
|
" mov x7, "X" \n"
|
||||||
|
|
||||||
".Lizamax_kernel_F_BEGIN: \n"
|
"1: //izamax_kernel_F_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" subs "N", "N", #1 \n"
|
" subs "N", "N", #1 \n"
|
||||||
" ble .Lizamax_kernel_L999 \n"
|
" ble 9f //izamax_kernel_L999 \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lizamax_kernel_F1 \n"
|
" beq 3f //izamax_kernel_F1 \n"
|
||||||
" add "Z", "Z", #1 \n"
|
" add "Z", "Z", #1 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_F: \n"
|
"2: //izamax_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lizamax_kernel_F \n"
|
" bne 2b //izamax_kernel_F \n"
|
||||||
" "KERNEL_F_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
" sub "Z", "Z", #1 \n"
|
" sub "Z", "Z", #1 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_F1: \n"
|
"3: //izamax_kernel_F1: \n"
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Lizamax_kernel_L999 \n"
|
" ble 9f //izamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_F10: \n"
|
"4: //izamax_kernel_F10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lizamax_kernel_F10 \n"
|
" bne 4b //izamax_kernel_F10 \n"
|
||||||
" b .Lizamax_kernel_L999 \n"
|
" b 9f //izamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_S_BEGIN: \n"
|
"5: //izamax_kernel_S_BEGIN: \n"
|
||||||
" "INIT" \n"
|
" "INIT" \n"
|
||||||
" subs "N", "N", #1 \n"
|
" subs "N", "N", #1 \n"
|
||||||
" ble .Lizamax_kernel_L999 \n"
|
" ble 9f //izamax_kernel_L999 \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lizamax_kernel_S1 \n"
|
" ble 7f //izamax_kernel_S1 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_S4: \n"
|
"6: //izamax_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lizamax_kernel_S4 \n"
|
" bne 6b //izamax_kernel_S4 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_S1: \n"
|
"7: //izamax_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lizamax_kernel_L999 \n"
|
" ble 9f //izamax_kernel_L999 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_S10: \n"
|
"8: //izamax_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lizamax_kernel_S10 \n"
|
" bne 8b //izamax_kernel_S10 \n"
|
||||||
|
|
||||||
".Lizamax_kernel_L999: \n"
|
"9: //izamax_kernel_L999: \n"
|
||||||
" mov x0, "INDEX" \n"
|
" mov x0, "INDEX" \n"
|
||||||
" b .Lizamax_kernel_DONE \n"
|
" b 11f //izamax_kernel_DONE \n"
|
||||||
|
|
||||||
".Lizamax_kernel_zero: \n"
|
"10: //izamax_kernel_zero: \n"
|
||||||
" mov x0, xzr \n"
|
" mov x0, xzr \n"
|
||||||
|
|
||||||
".Lizamax_kernel_DONE: \n"
|
"11: //izamax_kernel_DONE: \n"
|
||||||
" mov %[INDEX_], "INDEX" \n"
|
" mov %[INDEX_], "INDEX" \n"
|
||||||
|
|
||||||
: [INDEX_] "=r" (index) //%0
|
: [INDEX_] "=r" (index) //%0
|
||||||
|
|
|
@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" fmov s6, "REG0" \n"
|
" fmov s6, "REG0" \n"
|
||||||
" fmov s7, "REG0" \n"
|
" fmov s7, "REG0" \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lasum_kernel_S_BEGIN \n"
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lasum_kernel_F_BEGIN: \n"
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
" asr "J", "N", #6 \n"
|
" asr "J", "N", #6 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lasum_kernel_F1 \n"
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
".align 5 \n"
|
".align 5 \n"
|
||||||
".Lasum_kernel_F64: \n"
|
"2: //asum_kernel_F64: \n"
|
||||||
" "KERNEL_F64" \n"
|
" "KERNEL_F64" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F64 \n"
|
" bne 2b //asum_kernel_F64 \n"
|
||||||
" "KERNEL_F64_FINALIZE" \n"
|
" "KERNEL_F64_FINALIZE" \n"
|
||||||
|
|
||||||
".Lasum_kernel_F1: \n"
|
"3: //asum_kernel_F1: \n"
|
||||||
" ands "J", "N", #63 \n"
|
" ands "J", "N", #63 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_F10: \n"
|
"4: //asum_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F10 \n"
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
" b .Lasum_kernel_L999 \n"
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S_BEGIN: \n"
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
" "INIT_S" \n"
|
" "INIT_S" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lasum_kernel_S1 \n"
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S4: \n"
|
"6: //asum_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S4 \n"
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S1: \n"
|
"7: //asum_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S10: \n"
|
"8: //asum_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S10 \n"
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
".Lasum_kernel_L999: \n"
|
"9: //asum_kernel_L999: \n"
|
||||||
" fmov %[ASUM_], "SUMFD" \n"
|
" fmov %[ASUM_], "SUMFD" \n"
|
||||||
|
|
||||||
: [ASUM_] "=r" (asum) //%0
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
|
|
@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" fmov d6, xzr \n"
|
" fmov d6, xzr \n"
|
||||||
" fmov d7, xzr \n"
|
" fmov d7, xzr \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lnrm2_kernel_S_BEGIN \n"
|
" bne 5f //nrm2_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_BEGIN: \n"
|
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lnrm2_kernel_S_BEGIN \n"
|
" beq 5f //nrm2_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
" .align 5 \n"
|
" .align 5 \n"
|
||||||
".Lnrm2_kernel_F: \n"
|
"2: //nrm2_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lnrm2_kernel_F \n"
|
" bne 2b //nrm2_kernel_F \n"
|
||||||
" "KERNEL_F_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F1: \n"
|
"3: //nrm2_kernel_F1: \n"
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F10: \n"
|
"4: //nrm2_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lnrm2_kernel_F10 \n"
|
" bne 4b //nrm2_kernel_F10 \n"
|
||||||
" b .Lnrm2_kernel_L999 \n"
|
" b 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_S_BEGIN: \n"
|
"5: //nrm2_kernel_S_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lnrm2_kernel_S1 \n"
|
" ble 7f //nrm2_kernel_S1 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_S4: \n"
|
"6: //nrm2_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lnrm2_kernel_S4 \n"
|
" bne 6b //nrm2_kernel_S4 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_S1: \n"
|
"7: //nrm2_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_S10: \n"
|
"8: //nrm2_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lnrm2_kernel_S10 \n"
|
" bne 8b //nrm2_kernel_S10 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_L999: \n"
|
"9: //nrm2_kernel_L999: \n"
|
||||||
" "KERNEL_FINALIZE" \n"
|
" "KERNEL_FINALIZE" \n"
|
||||||
" fmov %[RET_], "SSQD" \n"
|
" fmov %[RET_], "SSQD" \n"
|
||||||
|
|
||||||
|
|
|
@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" fmov d6, "REG0" \n"
|
" fmov d6, "REG0" \n"
|
||||||
" fmov d7, "REG0" \n"
|
" fmov d7, "REG0" \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Lasum_kernel_S_BEGIN \n"
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lasum_kernel_F_BEGIN: \n"
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
" asr "J", "N", #4 \n"
|
" asr "J", "N", #4 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Lasum_kernel_F1 \n"
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
".align 5 \n"
|
".align 5 \n"
|
||||||
".Lasum_kernel_F16: \n"
|
"2: //asum_kernel_F16: \n"
|
||||||
" "KERNEL_F16" \n"
|
" "KERNEL_F16" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F16 \n"
|
" bne 2b //asum_kernel_F16 \n"
|
||||||
" "KERNEL_F16_FINALIZE" \n"
|
" "KERNEL_F16_FINALIZE" \n"
|
||||||
|
|
||||||
".Lasum_kernel_F1: \n"
|
"3: //asum_kernel_F1: \n"
|
||||||
" ands "J", "N", #15 \n"
|
" ands "J", "N", #15 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_F10: \n"
|
"4: //asum_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_F10 \n"
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
" b .Lasum_kernel_L999 \n"
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S_BEGIN: \n"
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
" "INIT_S" \n"
|
" "INIT_S" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Lasum_kernel_S1 \n"
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S4: \n"
|
"6: //asum_kernel_S4: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S4 \n"
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S1: \n"
|
"7: //asum_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Lasum_kernel_L999 \n"
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
".Lasum_kernel_S10: \n"
|
"8: //asum_kernel_S10: \n"
|
||||||
" "KERNEL_S1" \n"
|
" "KERNEL_S1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Lasum_kernel_S10 \n"
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
".Lasum_kernel_L999: \n"
|
"9: //asum_kernel_L999: \n"
|
||||||
" fmov %[ASUM_], "SUMF" \n"
|
" fmov %[ASUM_], "SUMF" \n"
|
||||||
|
|
||||||
: [ASUM_] "=r" (asum) //%0
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
|
|
@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
|
||||||
" fmov d6, xzr \n"
|
" fmov d6, xzr \n"
|
||||||
" fmov d7, xzr \n"
|
" fmov d7, xzr \n"
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne .Ldot_kernel_S_BEGIN \n"
|
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||||
" cmp "INC_Y", #1 \n"
|
" cmp "INC_Y", #1 \n"
|
||||||
" bne .Ldot_kernel_S_BEGIN \n"
|
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Ldot_kernel_F_BEGIN: \n"
|
"1: //dot_kernel_F_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" beq .Ldot_kernel_F1 \n"
|
" beq 3f //dot_kernel_F1 \n"
|
||||||
|
|
||||||
" .align 5 \n"
|
" .align 5 \n"
|
||||||
".Ldot_kernel_F: \n"
|
"2: //dot_kernel_F: \n"
|
||||||
" "KERNEL_F" \n"
|
" "KERNEL_F" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_F \n"
|
" bne 2b //dot_kernel_F \n"
|
||||||
" "KERNEL_F_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
|
|
||||||
".Ldot_kernel_F1: \n"
|
"3: //dot_kernel_F1: \n"
|
||||||
" ands "J", "N", #"N_REM_MASK" \n"
|
" ands "J", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_F10: \n"
|
"4: //dot_kernel_F10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_F10 \n"
|
" bne 4b //dot_kernel_F10 \n"
|
||||||
" b .Ldot_kernel_L999 \n"
|
" b 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S_BEGIN: \n"
|
"5: //dot_kernel_S_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||||
" asr "J", "N", #2 \n"
|
" asr "J", "N", #2 \n"
|
||||||
" cmp "J", xzr \n"
|
" cmp "J", xzr \n"
|
||||||
" ble .Ldot_kernel_S1 \n"
|
" ble 7f //dot_kernel_S1 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S4: \n"
|
"6: //dot_kernel_S4: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_S4 \n"
|
" bne 6b //dot_kernel_S4 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S1: \n"
|
"7: //dot_kernel_S1: \n"
|
||||||
" ands "J", "N", #3 \n"
|
" ands "J", "N", #3 \n"
|
||||||
" ble .Ldot_kernel_L999 \n"
|
" ble 9f //dot_kernel_L999 \n"
|
||||||
|
|
||||||
".Ldot_kernel_S10: \n"
|
"8: //dot_kernel_S10: \n"
|
||||||
" "KERNEL_F1" \n"
|
" "KERNEL_F1" \n"
|
||||||
" subs "J", "J", #1 \n"
|
" subs "J", "J", #1 \n"
|
||||||
" bne .Ldot_kernel_S10 \n"
|
" bne 8b //dot_kernel_S10 \n"
|
||||||
|
|
||||||
".Ldot_kernel_L999: \n"
|
"9: //dot_kernel_L999: \n"
|
||||||
" str "DOTF", [%[DOTR_]] \n"
|
" str "DOTF", [%[DOTR_]] \n"
|
||||||
" str "DOTI", [%[DOTI_]] \n"
|
" str "DOTI", [%[DOTI_]] \n"
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
|
|
||||||
set(LAPACK_SOURCES
|
set(LAPACK_SOURCES
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
enable_language(Fortran)
|
enable_language(Fortran)
|
||||||
|
|
||||||
|
@ -35,4 +36,4 @@ add_test(NAME "${float_type}blas2"
|
||||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
|
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
|
||||||
add_test(NAME "${float_type}blas3"
|
add_test(NAME "${float_type}blas3"
|
||||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
|
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
include_directories(${PROJECT_SOURCE_DIR})
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
set(OpenBLAS_utest_src
|
set(OpenBLAS_utest_src
|
||||||
utest_main.c
|
utest_main.c
|
||||||
|
@ -39,4 +40,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin}
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})
|
add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})
|
||||||
|
|
Loading…
Reference in New Issue