Merge remote-tracking branch 'upstream/develop' into develop

This commit is contained in:
Isuru Fernando 2017-08-04 07:57:55 +05:30
commit d9346930dd
29 changed files with 415 additions and 323 deletions

1
.gitignore vendored
View File

@ -5,6 +5,7 @@
*.def *.def
*.o *.o
*.out *.out
*.tmp
lapack-3.1.1 lapack-3.1.1
lapack-3.1.1.tgz lapack-3.1.1.tgz
lapack-3.4.1 lapack-3.4.1

View File

@ -1,4 +1,119 @@
# XXX: Precise is already deprecated, new default is Trusty.
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
dist: precise
sudo: false
language: c language: c
compiler: gcc
jobs:
include:
- &test-ubuntu
stage: test
addons:
apt:
packages:
- gfortran
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
script:
- set -e
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-ubuntu
addons:
apt:
packages:
- gcc-multilib
- gfortran-multilib
env:
- TARGET_BOX=LINUX32
- BTYPE="BINARY=32"
- stage: test
addons:
apt:
packages:
- binutils-mingw-w64-x86-64
- gcc-mingw-w64-x86-64
- gfortran-mingw-w64-x86-64
before_script: *common-before
script:
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=WIN64
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc.
# These jobs need sudo, so Travis runs them on VM-based infrastructure
# which is slower than container-based infrastructure used for jobs
# that don't require sudo.
- &test-alpine
stage: test
dist: trusty
sudo: true
language: minimal
before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
before_script: *common-before
script:
- set -e
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
- alpine make -C test $COMMON_FLAGS $BTYPE
- alpine make -C ctest $COMMON_FLAGS $BTYPE
- alpine make -C utest $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64"
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS,
# so it's "allowed to fail" for now (see allow_failures).
- &test-alpine-openmp
<<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 INTERFACE64=1"
# Build with the same flags that Alpine uses in its OpenBLAS package.
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
allow_failures:
- <<: *test-alpine-openmp
# whitelist
branches:
only:
- master
- develop
notifications: notifications:
webhooks: webhooks:
@ -7,32 +122,3 @@ notifications:
on_success: change # options: [always|never|change] default: always on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always on_failure: always # options: [always|never|change] default: always
on_start: never # options: [always|never|change] default: always on_start: never # options: [always|never|change] default: always
compiler:
- gcc
env:
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script:
- set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
# whitelist
branches:
only:
- master
- develop

View File

@ -231,43 +231,33 @@ install(TARGETS ${OpenBLAS_LIBNAME}
# Install include files # Install include files
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX}) set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
ADD_CUSTOM_COMMAND(
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
ADD_CUSTOM_TARGET(genconfig execute_process(COMMAND ${GENCONFIG_BIN}
ALL ${CMAKE_CURRENT_SOURCE_DIR}/config.h
DEPENDS openblas_config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h
) OUTPUT_VARIABLE OPENBLAS_CONFIG_H_CONTENTS)
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
file(WRITE ${CMAKE_BINARY_DIR}/openblas_config.tmp "${OPENBLAS_CONFIG_H_CONTENTS}")
configure_file(${CMAKE_BINARY_DIR}/openblas_config.tmp ${CMAKE_BINARY_DIR}/openblas_config.h COPYONLY)
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas file(WRITE ${CMAKE_BINARY_DIR}/f77blas.h "")
ALL file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n")
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS)
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "${COMMON_INTERFACE_H_CONTENTS}")
) file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#endif")
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NO_CBLAS) if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
ALL string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif() endif()
if(NOT NO_LAPACKE) if(NOT NO_LAPACKE)
@ -277,7 +267,7 @@ if(NOT NO_LAPACKE)
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
ADD_CUSTOM_TARGET(genlapacke ADD_CUSTOM_TARGET(genlapacke
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
) )
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif() endif()

View File

@ -87,7 +87,7 @@ endif ()
string(TOUPPER ${ARCH} UC_ARCH) string(TOUPPER ${ARCH} UC_ARCH)
file(WRITE ${TARGET_CONF} file(WRITE ${TARGET_CONF_TEMP}
"#define OS_${HOST_OS}\t1\n" "#define OS_${HOST_OS}\t1\n"
"#define ARCH_${UC_ARCH}\t1\n" "#define ARCH_${UC_ARCH}\t1\n"
"#define C_${COMPILER_ID}\t1\n" "#define C_${COMPILER_ID}\t1\n"
@ -95,7 +95,7 @@ file(WRITE ${TARGET_CONF}
"#define FUNDERSCORE\t${FU}\n") "#define FUNDERSCORE\t${FU}\n")
if (${HOST_OS} STREQUAL "WINDOWSSTORE") if (${HOST_OS} STREQUAL "WINDOWSSTORE")
file(APPEND ${TARGET_CONF} file(APPEND ${TARGET_CONF_TEMP}
"#define OS_WINNT\t1\n") "#define OS_WINNT\t1\n")
endif () endif ()

View File

@ -44,7 +44,7 @@ if (NOT ONLY_CBLAS)
# TODO: set FEXTRALIB flags a la f_check? # TODO: set FEXTRALIB flags a la f_check?
set(BU "_") set(BU "_")
file(APPEND ${TARGET_CONF} file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n" "#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n" "#define NEEDBUNDERSCORE 1\n"
"#define NEED2UNDERSCORES 0\n") "#define NEED2UNDERSCORES 0\n")
@ -56,7 +56,7 @@ else ()
set(NO_FBLAS 1) set(NO_FBLAS 1)
#set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler #set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler
set(BU "_") set(BU "_")
file(APPEND ${TARGET_CONF} file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n" "#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n") "#define NEEDBUNDERSCORE 1\n")
endif() endif()

View File

@ -2391,6 +2391,6 @@ foreach (Utils_FILE ${Utils_SRC})
endforeach () endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h") configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
include_directories(${lapacke_include_dir}) include_directories(${lapacke_include_dir})
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")

View File

@ -51,6 +51,7 @@ else()
set(TARGET_CONF "config.h") set(TARGET_CONF "config.h")
endif () endif ()
set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp")
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
@ -79,10 +80,11 @@ endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR}) file(MAKE_DIRECTORY ${GETARCH_DIR})
configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR} try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC} SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR}
OUTPUT_VARIABLE GETARCH_LOG OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
) )
@ -100,16 +102,17 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE G
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
# append config data from getarch to the TARGET file and read in CMake vars # append config data from getarch to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) file(APPEND ${TARGET_CONF_TEMP} ${GETARCH_CONF_OUT})
ParseGetArchVars(${GETARCH_MAKE_OUT}) ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR}) file(MAKE_DIRECTORY ${GETARCH2_DIR})
configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR} try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR}
OUTPUT_VARIABLE GETARCH2_LOG OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
) )
@ -124,7 +127,8 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
# append config data from getarch_2nd to the TARGET file and read in CMake vars # append config data from getarch_2nd to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT})
configure_file(${TARGET_CONF_TEMP} ${PROJECT_BINARY_DIR}/${TARGET_CONF} COPYONLY)
ParseGetArchVars(${GETARCH2_MAKE_OUT}) ParseGetArchVars(${GETARCH2_MAKE_OUT})
# compile gen_config_h # compile gen_config_h
@ -144,4 +148,4 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
if (NOT ${GEN_CONFIG_H_RESULT}) if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif () endif ()
endif () endif ()

View File

@ -234,7 +234,9 @@ function(GenerateNamedObjects sources_in)
string(REPLACE ";" "\n#define " define_source "${obj_defines}") string(REPLACE ";" "\n#define " define_source "${obj_defines}")
string(REPLACE "=" " " define_source "${define_source}") string(REPLACE "=" " " define_source "${define_source}")
file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"") file(WRITE ${new_source_file}.tmp "#define ${define_source}\n#include \"${old_source_file}\"")
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
file(REMOVE ${new_source_file}.tmp)
list(APPEND SRC_LIST_OUT ${new_source_file}) list(APPEND SRC_LIST_OUT ${new_source_file})
endforeach () endforeach ()

View File

@ -1,4 +1,5 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
enable_language(Fortran) enable_language(Fortran)

View File

@ -1,5 +1,6 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
# sources that need to be compiled twice, once with no flags and once with LOWER # sources that need to be compiled twice, once with no flags and once with LOWER
set(UL_SOURCES set(UL_SOURCES

View File

@ -1,4 +1,5 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa

View File

@ -1,4 +1,5 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
if (${CORE} STREQUAL "PPC440") if (${CORE} STREQUAL "PPC440")
set(MEMORY memory_qalloc.c) set(MEMORY memory_qalloc.c)

View File

@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
gotoblas_t *gotoblas = NULL; gotoblas_t *gotoblas = NULL;
#endif #endif
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#ifndef SMP #ifndef SMP
@ -187,25 +186,24 @@ int i,n;
#if !defined(__GLIBC_PREREQ) #if !defined(__GLIBC_PREREQ)
return nums; return nums;
#endif #else
#if !__GLIBC_PREREQ(2, 3) #if !__GLIBC_PREREQ(2, 3)
return nums; return nums;
#endif #endif
#if !__GLIBC_PREREQ(2, 7) #if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
if (ret!=0) return nums; if (ret!=0) return nums;
n=0; n=0;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++; if (CPU_ISSET(i,cpusetp)) n++;
nums=n; nums=n;
#else #else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
#endif #endif
return nums; return nums;
#endif #else
cpusetp = CPU_ALLOC(nums); cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums; if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums); size = CPU_ALLOC_SIZE(nums);
@ -214,6 +212,8 @@ int i,n;
nums = CPU_COUNT_S(size,cpusetp); nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
return nums; return nums;
#endif
#endif
} }
#endif #endif
#endif #endif

View File

@ -1,5 +1,6 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
set(BLAS1_SOURCES set(BLAS1_SOURCES

View File

@ -1,5 +1,6 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
# Makefile # Makefile

View File

@ -147,57 +147,57 @@ static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" fmov s6, "REG0" \n" " fmov s6, "REG0" \n"
" fmov s7, "REG0" \n" " fmov s7, "REG0" \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lasum_kernel_S_BEGIN \n" " bne 5f //asum_kernel_S_BEGIN \n"
".Lasum_kernel_F_BEGIN: \n" "1: //asum_kernel_F_BEGIN: \n"
" asr "J", "N", #5 \n" " asr "J", "N", #5 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lasum_kernel_F1 \n" " beq 3f //asum_kernel_F1 \n"
".Lasum_kernel_F32: \n" "2: //asum_kernel_F32: \n"
" "KERNEL_F32" \n" " "KERNEL_F32" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F32 \n" " bne 2b //asum_kernel_F32 \n"
" "KERNEL_F32_FINALIZE" \n" " "KERNEL_F32_FINALIZE" \n"
".Lasum_kernel_F1: \n" "3: //asum_kernel_F1: \n"
" ands "J", "N", #31 \n" " ands "J", "N", #31 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_F10: \n" "4: //asum_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F10 \n" " bne 4b //asum_kernel_F10 \n"
" b .Lasum_kernel_L999 \n" " b 9f //asum_kernel_L999 \n"
".Lasum_kernel_S_BEGIN: \n" "5: //asum_kernel_S_BEGIN: \n"
" "INIT_S" \n" " "INIT_S" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lasum_kernel_S1 \n" " ble 7f //asum_kernel_S1 \n"
".Lasum_kernel_S4: \n" "6: //asum_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S4 \n" " bne 6b //asum_kernel_S4 \n"
".Lasum_kernel_S1: \n" "7: //asum_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_S10: \n" "8: //asum_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S10 \n" " bne 8b //asum_kernel_S10 \n"
".Lasum_kernel_L999: \n" "9: //asum_kernel_L999: \n"
" fmov %[ASUM_], "SUMFD" \n" " fmov %[ASUM_], "SUMFD" \n"
: [ASUM_] "=r" (asum) //%0 : [ASUM_] "=r" (asum) //%0

View File

@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_
" mov "Y", %[Y_] \n" " mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n" " mov "INC_Y", %[INCY_] \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lcopy_kernel_L999 \n" " ble 8f //copy_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lcopy_kernel_S_BEGIN \n" " bne 4f //copy_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n" " cmp "INC_Y", #1 \n"
" bne .Lcopy_kernel_S_BEGIN \n" " bne 4f //copy_kernel_S_BEGIN \n"
".Lcopy_kernel_F_BEGIN: \n" "// .Lcopy_kernel_F_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lcopy_kernel_F1 \n" " beq 2f //copy_kernel_F1 \n"
" .align 5 \n" " .align 5 \n"
".Lcopy_kernel_F: \n" "1: //copy_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lcopy_kernel_F \n" " bne 1b //copy_kernel_F \n"
".Lcopy_kernel_F1: \n" "2: //copy_kernel_F1: \n"
#if defined(COMPLEX) && defined(DOUBLE) #if defined(COMPLEX) && defined(DOUBLE)
" b .Lcopy_kernel_L999 \n" " b 8f //copy_kernel_L999 \n"
#else #else
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Lcopy_kernel_L999 \n" " ble 8f //copy_kernel_L999 \n"
#endif #endif
".Lcopy_kernel_F10: \n" "3: //copy_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lcopy_kernel_F10 \n" " bne 3b //copy_kernel_F10 \n"
" b .Lcopy_kernel_L999 \n" " b 8f //copy_kernel_L999 \n"
".Lcopy_kernel_S_BEGIN: \n" "4: //copy_kernel_S_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lcopy_kernel_S1 \n" " ble 6f //copy_kernel_S1 \n"
".Lcopy_kernel_S4: \n" "5: //copy_kernel_S4: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lcopy_kernel_S4 \n" " bne 5b //copy_kernel_S4 \n"
".Lcopy_kernel_S1: \n" "6: //copy_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lcopy_kernel_L999 \n" " ble 8f //copy_kernel_L999 \n"
".Lcopy_kernel_S10: \n" "7: //copy_kernel_S10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lcopy_kernel_S10 \n" " bne 7b //copy_kernel_S10 \n"
".Lcopy_kernel_L999: \n" "8: //copy_kernel_L999: \n"
: :
: [N_] "r" (n), //%1 : [N_] "r" (n), //%1

View File

@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" fmov d6, "REG0" \n" " fmov d6, "REG0" \n"
" fmov d7, "REG0" \n" " fmov d7, "REG0" \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lasum_kernel_S_BEGIN \n" " bne 5f //asum_kernel_S_BEGIN \n"
".Lasum_kernel_F_BEGIN: \n" "1: //asum_kernel_F_BEGIN: \n"
" asr "J", "N", #5 \n" " asr "J", "N", #5 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lasum_kernel_F1 \n" " beq 3f //asum_kernel_F1 \n"
".align 5 \n" ".align 5 \n"
".Lasum_kernel_F32: \n" "2: //asum_kernel_F32: \n"
" "KERNEL_F32" \n" " "KERNEL_F32" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F32 \n" " bne 2b //asum_kernel_F32 \n"
" "KERNEL_F32_FINALIZE" \n" " "KERNEL_F32_FINALIZE" \n"
".Lasum_kernel_F1: \n" "3: //asum_kernel_F1: \n"
" ands "J", "N", #31 \n" " ands "J", "N", #31 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_F10: \n" "4: //asum_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F10 \n" " bne 4b //asum_kernel_F10 \n"
" b .Lasum_kernel_L999 \n" " b 9f //asum_kernel_L999 \n"
".Lasum_kernel_S_BEGIN: \n" "5: //asum_kernel_S_BEGIN: \n"
" "INIT_S" \n" " "INIT_S" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lasum_kernel_S1 \n" " ble 7f //asum_kernel_S1 \n"
".Lasum_kernel_S4: \n" "6: //asum_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S4 \n" " bne 6b //asum_kernel_S4 \n"
".Lasum_kernel_S1: \n" "7: //asum_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_S10: \n" "8: //asum_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S10 \n" " bne 8b //asum_kernel_S10 \n"
".Lasum_kernel_L999: \n" "9: //asum_kernel_L999: \n"
" fmov %[ASUM_], "SUMF" \n" " fmov %[ASUM_], "SUMF" \n"
: [ASUM_] "=r" (asum) //%0 : [ASUM_] "=r" (asum) //%0

View File

@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B
" fmov d6, xzr \n" " fmov d6, xzr \n"
" fmov d7, xzr \n" " fmov d7, xzr \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n" " bne 5f //dot_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n" " cmp "INC_Y", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n" " bne 5f //dot_kernel_S_BEGIN \n"
".Ldot_kernel_F_BEGIN: \n" "1: //dot_kernel_F_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Ldot_kernel_F1 \n" " beq 3f //dot_kernel_F1 \n"
" .align 5 \n" " .align 5 \n"
".Ldot_kernel_F: \n" "2: //dot_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_F \n" " bne 2b //dot_kernel_F \n"
" "KERNEL_F_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
".Ldot_kernel_F1: \n" "3: //dot_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
".Ldot_kernel_F10: \n" "4: //dot_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_F10 \n" " bne 4b //dot_kernel_F10 \n"
" b .Ldot_kernel_L999 \n" " b 9f //dot_kernel_L999 \n"
".Ldot_kernel_S_BEGIN: \n" "5: //dot_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Ldot_kernel_S1 \n" " ble 7f //dot_kernel_S1 \n"
".Ldot_kernel_S4: \n" "6: //dot_kernel_S4: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_S4 \n" " bne 6b //dot_kernel_S4 \n"
".Ldot_kernel_S1: \n" "7: //dot_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
".Ldot_kernel_S10: \n" "8: //dot_kernel_S10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_S10 \n" " bne 8b //dot_kernel_S10 \n"
".Ldot_kernel_L999: \n" "9: //dot_kernel_L999: \n"
" str "DOTF", [%[DOT_]] \n" " str "DOTF", [%[DOT_]] \n"
: :

View File

@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" fmov "SCALE", xzr \n" " fmov "SCALE", xzr \n"
" fmov "SSQ", #1.0 \n" " fmov "SSQ", #1.0 \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_F_BEGIN: \n" "1: //nrm2_kernel_F_BEGIN: \n"
" fmov "REGZERO", xzr \n" " fmov "REGZERO", xzr \n"
" fmov "REGONE", #1.0 \n" " fmov "REGONE", #1.0 \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" mov "J", "N" \n" " mov "J", "N" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lnrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_F_ZERO_SKIP: \n" "2: //nrm2_kernel_F_ZERO_SKIP: \n"
" ldr d4, ["X"] \n" " ldr d4, ["X"] \n"
" fcmp d4, "REGZERO" \n" " fcmp d4, "REGZERO" \n"
" bne .Lnrm2_kernel_F_INIT \n" " bne 3f //nrm2_kernel_F_INIT \n"
#if defined(COMPLEX) #if defined(COMPLEX)
" ldr d4, ["X", #8] \n" " ldr d4, ["X", #8] \n"
" fcmp d4, "REGZERO" \n" " fcmp d4, "REGZERO" \n"
" bne .Lnrm2_kernel_F_INIT_I \n" " bne 4f //nrm2_kernel_F_INIT_I \n"
#endif #endif
" add "X", "X", "INC_X" \n" " add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" beq .Lnrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
" b .Lnrm2_kernel_F_ZERO_SKIP \n" " b 2b //nrm2_kernel_F_ZERO_SKIP \n"
".Lnrm2_kernel_F_INIT: \n" "3: //nrm2_kernel_F_INIT: \n"
" ldr d4, ["X"] \n" " ldr d4, ["X"] \n"
" fabs d4, d4 \n" " fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n"
@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" fadd "SSQ", "SSQ", d4 \n" " fadd "SSQ", "SSQ", d4 \n"
" fmov "SCALE", "CUR_MAX" \n" " fmov "SCALE", "CUR_MAX" \n"
#if defined(COMPLEX) #if defined(COMPLEX)
".Lnrm2_kernel_F_INIT_I: \n" "4: //nrm2_kernel_F_INIT_I: \n"
" ldr d3, ["X", #8] \n" " ldr d3, ["X", #8] \n"
" fabs d3, d3 \n" " fabs d3, d3 \n"
" fmax "CUR_MAX", "SCALE", d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n"
@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
#endif #endif
" add "X", "X", "INC_X" \n" " add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" beq .Lnrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_F_START: \n" "5: //nrm2_kernel_F_START: \n"
" cmp "INC_X", #"SZ" \n" " cmp "INC_X", #"SZ" \n"
" bne .Lnrm2_kernel_F1 \n" " bne 8f //nrm2_kernel_F1 \n"
" asr "K", "J", #4 \n" " asr "K", "J", #4 \n"
" cmp "K", xzr \n" " cmp "K", xzr \n"
" beq .Lnrm2_kernel_F1 \n" " beq 8f //nrm2_kernel_F1 \n"
".Lnrm2_kernel_F: \n" "6: //nrm2_kernel_F: \n"
" ldp q16, q17, ["X"] \n" " ldp q16, q17, ["X"] \n"
" ldp q18, q19, ["X", #32] \n" " ldp q18, q19, ["X", #32] \n"
" ldp q20, q21, ["X", #64] \n" " ldp q20, q21, ["X", #64] \n"
@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" fmov "SCALE", "CUR_MAX" \n" " fmov "SCALE", "CUR_MAX" \n"
#endif #endif
" subs "K", "K", #1 \n" " subs "K", "K", #1 \n"
" bne .Lnrm2_kernel_F \n" " bne 6b //nrm2_kernel_F \n"
".Lnrm2_kernel_F_DONE: \n" "7: //nrm2_kernel_F_DONE: \n"
" ands "J", "J", #15 \n" " ands "J", "J", #15 \n"
" beq .Lnrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_F1: \n" "8: //nrm2_kernel_F1: \n"
" ldr d4, ["X"] \n" " ldr d4, ["X"] \n"
" fabs d4, d4 \n" " fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n"
@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
#endif #endif
" add "X", "X", "INC_X" \n" " add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F1 \n" " bne 8b //nrm2_kernel_F1 \n"
".Lnrm2_kernel_L999: \n" "9: //nrm2_kernel_L999: \n"
" str "SSQ", [%[SSQ_]] \n" " str "SSQ", [%[SSQ_]] \n"
" str "SCALE", [%[SCALE_]] \n" " str "SCALE", [%[SCALE_]] \n"

View File

@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
#endif #endif
static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG index = 0; BLASLONG index = 0;
@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" mov "INC_X", %[INCX_] \n" " mov "INC_X", %[INCX_] \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Liamax_kernel_zero \n" " ble 10f //iamax_kernel_zero \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Liamax_kernel_zero \n" " ble 10f //iamax_kernel_zero \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Liamax_kernel_S_BEGIN \n" " bne 5f //iamax_kernel_S_BEGIN \n"
" mov x7, "X" \n" " mov x7, "X" \n"
".Liamax_kernel_F_BEGIN: \n" "1: //iamax_kernel_F_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" subs "N", "N", #1 \n" " subs "N", "N", #1 \n"
" ble .Liamax_kernel_L999 \n" " ble 9f //iamax_kernel_L999 \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Liamax_kernel_F1 \n" " beq 3f //iamax_kernel_F1 \n"
" add "Z", "Z", #1 \n" " add "Z", "Z", #1 \n"
".Liamax_kernel_F: \n" "2: //iamax_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Liamax_kernel_F \n" " bne 2b //iamax_kernel_F \n"
" "KERNEL_F_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
" sub "Z", "Z", #1 \n" " sub "Z", "Z", #1 \n"
".Liamax_kernel_F1: \n" "3: //iamax_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Liamax_kernel_L999 \n" " ble 9f //iamax_kernel_L999 \n"
".Liamax_kernel_F10: \n" "4: //iamax_kernel_F10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Liamax_kernel_F10 \n" " bne 4b //iamax_kernel_F10 \n"
" b .Liamax_kernel_L999 \n" " b 9f //iamax_kernel_L999 \n"
".Liamax_kernel_S_BEGIN: \n" "5: //iamax_kernel_S_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" subs "N", "N", #1 \n" " subs "N", "N", #1 \n"
" ble .Liamax_kernel_L999 \n" " ble 9f //iamax_kernel_L999 \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Liamax_kernel_S1 \n" " ble 7f //iamax_kernel_S1 \n"
".Liamax_kernel_S4: \n" "6: //iamax_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Liamax_kernel_S4 \n" " bne 6b //iamax_kernel_S4 \n"
".Liamax_kernel_S1: \n" "7: //iamax_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Liamax_kernel_L999 \n" " ble 9f //iamax_kernel_L999 \n"
".Liamax_kernel_S10: \n" "8: //iamax_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Liamax_kernel_S10 \n" " bne 8b //iamax_kernel_S10 \n"
".Liamax_kernel_L999: \n" "9: //iamax_kernel_L999: \n"
" mov x0, "INDEX" \n" " mov x0, "INDEX" \n"
" b .Liamax_kernel_DONE \n" " b 11f //iamax_kernel_DONE \n"
".Liamax_kernel_zero: \n" "10: //iamax_kernel_zero: \n"
" mov x0, xzr \n" " mov x0, xzr \n"
".Liamax_kernel_DONE: \n" "11: //iamax_kernel_DONE: \n"
" mov %[INDEX_], "INDEX" \n" " mov %[INDEX_], "INDEX" \n"
: [INDEX_] "=r" (index) //%0 : [INDEX_] "=r" (index) //%0

View File

@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" mov "INC_X", %[INCX_] \n" " mov "INC_X", %[INCX_] \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lizamax_kernel_zero \n" " ble 10f //izamax_kernel_zero \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lizamax_kernel_zero \n" " ble 10f //izamax_kernel_zero \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lizamax_kernel_S_BEGIN \n" " bne 5f //izamax_kernel_S_BEGIN \n"
" mov x7, "X" \n" " mov x7, "X" \n"
".Lizamax_kernel_F_BEGIN: \n" "1: //izamax_kernel_F_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" subs "N", "N", #1 \n" " subs "N", "N", #1 \n"
" ble .Lizamax_kernel_L999 \n" " ble 9f //izamax_kernel_L999 \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lizamax_kernel_F1 \n" " beq 3f //izamax_kernel_F1 \n"
" add "Z", "Z", #1 \n" " add "Z", "Z", #1 \n"
".Lizamax_kernel_F: \n" "2: //izamax_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lizamax_kernel_F \n" " bne 2b //izamax_kernel_F \n"
" "KERNEL_F_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
" sub "Z", "Z", #1 \n" " sub "Z", "Z", #1 \n"
".Lizamax_kernel_F1: \n" "3: //izamax_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Lizamax_kernel_L999 \n" " ble 9f //izamax_kernel_L999 \n"
".Lizamax_kernel_F10: \n" "4: //izamax_kernel_F10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lizamax_kernel_F10 \n" " bne 4b //izamax_kernel_F10 \n"
" b .Lizamax_kernel_L999 \n" " b 9f //izamax_kernel_L999 \n"
".Lizamax_kernel_S_BEGIN: \n" "5: //izamax_kernel_S_BEGIN: \n"
" "INIT" \n" " "INIT" \n"
" subs "N", "N", #1 \n" " subs "N", "N", #1 \n"
" ble .Lizamax_kernel_L999 \n" " ble 9f //izamax_kernel_L999 \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lizamax_kernel_S1 \n" " ble 7f //izamax_kernel_S1 \n"
".Lizamax_kernel_S4: \n" "6: //izamax_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lizamax_kernel_S4 \n" " bne 6b //izamax_kernel_S4 \n"
".Lizamax_kernel_S1: \n" "7: //izamax_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lizamax_kernel_L999 \n" " ble 9f //izamax_kernel_L999 \n"
".Lizamax_kernel_S10: \n" "8: //izamax_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lizamax_kernel_S10 \n" " bne 8b //izamax_kernel_S10 \n"
".Lizamax_kernel_L999: \n" "9: //izamax_kernel_L999: \n"
" mov x0, "INDEX" \n" " mov x0, "INDEX" \n"
" b .Lizamax_kernel_DONE \n" " b 11f //izamax_kernel_DONE \n"
".Lizamax_kernel_zero: \n" "10: //izamax_kernel_zero: \n"
" mov x0, xzr \n" " mov x0, xzr \n"
".Lizamax_kernel_DONE: \n" "11: //izamax_kernel_DONE: \n"
" mov %[INDEX_], "INDEX" \n" " mov %[INDEX_], "INDEX" \n"
: [INDEX_] "=r" (index) //%0 : [INDEX_] "=r" (index) //%0

View File

@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" fmov s6, "REG0" \n" " fmov s6, "REG0" \n"
" fmov s7, "REG0" \n" " fmov s7, "REG0" \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lasum_kernel_S_BEGIN \n" " bne 5f //asum_kernel_S_BEGIN \n"
".Lasum_kernel_F_BEGIN: \n" "1: //asum_kernel_F_BEGIN: \n"
" asr "J", "N", #6 \n" " asr "J", "N", #6 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lasum_kernel_F1 \n" " beq 3f //asum_kernel_F1 \n"
".align 5 \n" ".align 5 \n"
".Lasum_kernel_F64: \n" "2: //asum_kernel_F64: \n"
" "KERNEL_F64" \n" " "KERNEL_F64" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F64 \n" " bne 2b //asum_kernel_F64 \n"
" "KERNEL_F64_FINALIZE" \n" " "KERNEL_F64_FINALIZE" \n"
".Lasum_kernel_F1: \n" "3: //asum_kernel_F1: \n"
" ands "J", "N", #63 \n" " ands "J", "N", #63 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_F10: \n" "4: //asum_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F10 \n" " bne 4b //asum_kernel_F10 \n"
" b .Lasum_kernel_L999 \n" " b 9f //asum_kernel_L999 \n"
".Lasum_kernel_S_BEGIN: \n" "5: //asum_kernel_S_BEGIN: \n"
" "INIT_S" \n" " "INIT_S" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lasum_kernel_S1 \n" " ble 7f //asum_kernel_S1 \n"
".Lasum_kernel_S4: \n" "6: //asum_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S4 \n" " bne 6b //asum_kernel_S4 \n"
".Lasum_kernel_S1: \n" "7: //asum_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_S10: \n" "8: //asum_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S10 \n" " bne 8b //asum_kernel_S10 \n"
".Lasum_kernel_L999: \n" "9: //asum_kernel_L999: \n"
" fmov %[ASUM_], "SUMFD" \n" " fmov %[ASUM_], "SUMFD" \n"
: [ASUM_] "=r" (asum) //%0 : [ASUM_] "=r" (asum) //%0

View File

@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" fmov d6, xzr \n" " fmov d6, xzr \n"
" fmov d7, xzr \n" " fmov d7, xzr \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lnrm2_kernel_S_BEGIN \n" " bne 5f //nrm2_kernel_S_BEGIN \n"
".Lnrm2_kernel_F_BEGIN: \n" "1: //nrm2_kernel_F_BEGIN: \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lnrm2_kernel_S_BEGIN \n" " beq 5f //nrm2_kernel_S_BEGIN \n"
" .align 5 \n" " .align 5 \n"
".Lnrm2_kernel_F: \n" "2: //nrm2_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F \n" " bne 2b //nrm2_kernel_F \n"
" "KERNEL_F_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
".Lnrm2_kernel_F1: \n" "3: //nrm2_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_F10: \n" "4: //nrm2_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F10 \n" " bne 4b //nrm2_kernel_F10 \n"
" b .Lnrm2_kernel_L999 \n" " b 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_S_BEGIN: \n" "5: //nrm2_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lnrm2_kernel_S1 \n" " ble 7f //nrm2_kernel_S1 \n"
".Lnrm2_kernel_S4: \n" "6: //nrm2_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S4 \n" " bne 6b //nrm2_kernel_S4 \n"
".Lnrm2_kernel_S1: \n" "7: //nrm2_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lnrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
".Lnrm2_kernel_S10: \n" "8: //nrm2_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S10 \n" " bne 8b //nrm2_kernel_S10 \n"
".Lnrm2_kernel_L999: \n" "9: //nrm2_kernel_L999: \n"
" "KERNEL_FINALIZE" \n" " "KERNEL_FINALIZE" \n"
" fmov %[RET_], "SSQD" \n" " fmov %[RET_], "SSQD" \n"

View File

@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" fmov d6, "REG0" \n" " fmov d6, "REG0" \n"
" fmov d7, "REG0" \n" " fmov d7, "REG0" \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Lasum_kernel_S_BEGIN \n" " bne 5f //asum_kernel_S_BEGIN \n"
".Lasum_kernel_F_BEGIN: \n" "1: //asum_kernel_F_BEGIN: \n"
" asr "J", "N", #4 \n" " asr "J", "N", #4 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Lasum_kernel_F1 \n" " beq 3f //asum_kernel_F1 \n"
".align 5 \n" ".align 5 \n"
".Lasum_kernel_F16: \n" "2: //asum_kernel_F16: \n"
" "KERNEL_F16" \n" " "KERNEL_F16" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F16 \n" " bne 2b //asum_kernel_F16 \n"
" "KERNEL_F16_FINALIZE" \n" " "KERNEL_F16_FINALIZE" \n"
".Lasum_kernel_F1: \n" "3: //asum_kernel_F1: \n"
" ands "J", "N", #15 \n" " ands "J", "N", #15 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_F10: \n" "4: //asum_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_F10 \n" " bne 4b //asum_kernel_F10 \n"
" b .Lasum_kernel_L999 \n" " b 9f //asum_kernel_L999 \n"
".Lasum_kernel_S_BEGIN: \n" "5: //asum_kernel_S_BEGIN: \n"
" "INIT_S" \n" " "INIT_S" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Lasum_kernel_S1 \n" " ble 7f //asum_kernel_S1 \n"
".Lasum_kernel_S4: \n" "6: //asum_kernel_S4: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S4 \n" " bne 6b //asum_kernel_S4 \n"
".Lasum_kernel_S1: \n" "7: //asum_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Lasum_kernel_L999 \n" " ble 9f //asum_kernel_L999 \n"
".Lasum_kernel_S10: \n" "8: //asum_kernel_S10: \n"
" "KERNEL_S1" \n" " "KERNEL_S1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Lasum_kernel_S10 \n" " bne 8b //asum_kernel_S10 \n"
".Lasum_kernel_L999: \n" "9: //asum_kernel_L999: \n"
" fmov %[ASUM_], "SUMF" \n" " fmov %[ASUM_], "SUMF" \n"
: [ASUM_] "=r" (asum) //%0 : [ASUM_] "=r" (asum) //%0

View File

@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
" fmov d6, xzr \n" " fmov d6, xzr \n"
" fmov d7, xzr \n" " fmov d7, xzr \n"
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n" " bne 5f //dot_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n" " cmp "INC_Y", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n" " bne 5f //dot_kernel_S_BEGIN \n"
".Ldot_kernel_F_BEGIN: \n" "1: //dot_kernel_F_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" beq .Ldot_kernel_F1 \n" " beq 3f //dot_kernel_F1 \n"
" .align 5 \n" " .align 5 \n"
".Ldot_kernel_F: \n" "2: //dot_kernel_F: \n"
" "KERNEL_F" \n" " "KERNEL_F" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_F \n" " bne 2b //dot_kernel_F \n"
" "KERNEL_F_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
".Ldot_kernel_F1: \n" "3: //dot_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n" " ands "J", "N", #"N_REM_MASK" \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
".Ldot_kernel_F10: \n" "4: //dot_kernel_F10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_F10 \n" " bne 4b //dot_kernel_F10 \n"
" b .Ldot_kernel_L999 \n" " b 9f //dot_kernel_L999 \n"
".Ldot_kernel_S_BEGIN: \n" "5: //dot_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #2 \n" " asr "J", "N", #2 \n"
" cmp "J", xzr \n" " cmp "J", xzr \n"
" ble .Ldot_kernel_S1 \n" " ble 7f //dot_kernel_S1 \n"
".Ldot_kernel_S4: \n" "6: //dot_kernel_S4: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_S4 \n" " bne 6b //dot_kernel_S4 \n"
".Ldot_kernel_S1: \n" "7: //dot_kernel_S1: \n"
" ands "J", "N", #3 \n" " ands "J", "N", #3 \n"
" ble .Ldot_kernel_L999 \n" " ble 9f //dot_kernel_L999 \n"
".Ldot_kernel_S10: \n" "8: //dot_kernel_S10: \n"
" "KERNEL_F1" \n" " "KERNEL_F1" \n"
" subs "J", "J", #1 \n" " subs "J", "J", #1 \n"
" bne .Ldot_kernel_S10 \n" " bne 8b //dot_kernel_S10 \n"
".Ldot_kernel_L999: \n" "9: //dot_kernel_L999: \n"
" str "DOTF", [%[DOTR_]] \n" " str "DOTF", [%[DOTR_]] \n"
" str "DOTI", [%[DOTI_]] \n" " str "DOTI", [%[DOTI_]] \n"

View File

@ -1,5 +1,6 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
set(LAPACK_SOURCES set(LAPACK_SOURCES

View File

@ -1,4 +1,5 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
enable_language(Fortran) enable_language(Fortran)
@ -35,4 +36,4 @@ add_test(NAME "${float_type}blas2"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
add_test(NAME "${float_type}blas3" add_test(NAME "${float_type}blas3"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
endforeach() endforeach()

View File

@ -1,4 +1,5 @@
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
set(OpenBLAS_utest_src set(OpenBLAS_utest_src
utest_main.c utest_main.c
@ -39,4 +40,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin}
) )
endif() endif()
add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})