From 9332042d5f6a630d00c868781a0eb3e660517bd7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:13:24 +0200 Subject: [PATCH 01/15] Fix range exceeding actual data size in quick_divide --- driver/level2/gbmv_thread.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index e86b565f8..6073a4856 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -233,6 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif + if (range_m[num_cpu] > n) range_m[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; From 857f61bc5dea502d07946a8637e70944b277ee2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:21:53 +0200 Subject: [PATCH 02/15] Fix range limit exceeding data size in last step --- driver/level2/sbmv_thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 5718c0ec9..68ee93ee1 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; From 585c0010a5de7b42ab32ddb8230b4bc20eeedd43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:27:02 +0200 Subject: [PATCH 03/15] 
Fix range limit exceeding actual data size in last step --- driver/level2/tbmv_thread.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index 226a922e9..aaf4958e2 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; From 63cfa32691680505e6b9daf0997755178ddd3144 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Jul 2017 21:02:43 +0200 Subject: [PATCH 04/15] Rework __GLIBC_PREREQ checks to avoid breaking non-glibc builds --- driver/others/memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index b5b58b6fd..661f7c4eb 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; #endif - extern void openblas_warning(int verbose, const char * msg); #ifndef SMP @@ -187,25 +186,24 @@ int i,n; #if !defined(__GLIBC_PREREQ) return nums; -#endif -#if !__GLIBC_PREREQ(2, 3) +#else + #if !__GLIBC_PREREQ(2, 3) return nums; -#endif + #endif -#if !__GLIBC_PREREQ(2, 7) + #if !__GLIBC_PREREQ(2, 7) ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) return nums; n=0; -#if !__GLIBC_PREREQ(2, 6) + #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Tue, 1 Aug 2017 11:02:00 +0530 Subject: [PATCH 05/15] Don't change timestamps --- .gitignore | 1 + CMakeLists.txt | 44 ++++++++++++++++++-------------------------- cmake/c_check.cmake | 4 ++-- cmake/f_check.cmake | 4 ++-- cmake/prebuild.cmake | 14 +++++++++----- cmake/utils.cmake | 4 +++- 6 files changed, 35 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index a2ec7dd08..e9d08ca7e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.def *.o *.out +*.tmp lapack-3.1.1 lapack-3.1.1.tgz lapack-3.4.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 0243ca963..45111ce30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,42 +231,34 @@ install(TARGETS ${OpenBLAS_LIBNAME} # Install include files set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX}) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h - COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h - ) - ADD_CUSTOM_TARGET(genconfig - ALL - DEPENDS openblas_config.h - ) - add_dependencies(genconfig ${OpenBLAS_LIBNAME}) + execute_process(COMMAND ${GENCONFIG_BIN} + ${CMAKE_CURRENT_SOURCE_DIR}/config.h + ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h + OUTPUT_VARIABLE OPENBLAS_CONFIG_H_CONTENTS) + + file(WRITE ${CMAKE_BINARY_DIR}/openblas_config.tmp "${OPENBLAS_CONFIG_H_CONTENTS}") + 
configure_file(${CMAKE_BINARY_DIR}/openblas_config.tmp ${CMAKE_BINARY_DIR}/openblas_config.h COPYONLY) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - ADD_CUSTOM_TARGET(genf77blas - ALL - COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h - ) - add_dependencies(genf77blas ${OpenBLAS_LIBNAME}) - + file(WRITE ${CMAKE_BINARY_DIR}/f77blas.tmp "") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n") + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS) + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "${COMMON_INTERFACE_H_CONTENTS}") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#endif") + configure_file(${CMAKE_BINARY_DIR}/f77blas.tmp ${CMAKE_BINARY_DIR}/f77blas.h COPYONLY) install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - ADD_CUSTOM_TARGET(gencblas - ALL - COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" - COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h - ) - add_dependencies(gencblas ${OpenBLAS_LIBNAME}) - + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") + configure_file(${CMAKE_BINARY_DIR}/cblas.tmp ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h COPYONLY) install 
(FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() @@ -277,7 +269,7 @@ if(NOT NO_LAPACKE) install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) ADD_CUSTOM_TARGET(genlapacke - COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 56ae612ea..fc376c659 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -84,7 +84,7 @@ endif () string(TOUPPER ${ARCH} UC_ARCH) -file(WRITE ${TARGET_CONF} +file(WRITE ${TARGET_CONF_TEMP} "#define OS_${HOST_OS}\t1\n" "#define ARCH_${UC_ARCH}\t1\n" "#define C_${COMPILER_ID}\t1\n" @@ -92,7 +92,7 @@ file(WRITE ${TARGET_CONF} "#define FUNDERSCORE\t${FU}\n") if (${HOST_OS} STREQUAL "WINDOWSSTORE") - file(APPEND ${TARGET_CONF} + file(APPEND ${TARGET_CONF_TEMP} "#define OS_WINNT\t1\n") endif () diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index e8fe4bfa7..6eee027a5 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -44,7 +44,7 @@ if (NOT ONLY_CBLAS) # TODO: set FEXTRALIB flags a la f_check? 
set(BU "_") - file(APPEND ${TARGET_CONF} + file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n" "#define NEED2UNDERSCORES 0\n") @@ -56,7 +56,7 @@ else () set(NO_FBLAS 1) #set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler set(BU "_") - file(APPEND ${TARGET_CONF} + file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n") endif() diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a7f98bfb8..422d4cb37 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -51,6 +51,7 @@ else() set(TARGET_CONF "config.h") endif () +set(TARGET_CONF_TEMP "${TARGET_CONF}.tmp") include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") if (NOT NOFORTRAN) @@ -80,10 +81,11 @@ endif () set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) +configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -101,16 +103,17 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE G message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") # append config data from getarch to the TARGET file and read in CMake vars -file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) +file(APPEND ${TARGET_CONF_TEMP} ${GETARCH_CONF_OUT}) ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) +configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL 
"WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) @@ -125,7 +128,8 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) # append config data from getarch_2nd to the TARGET file and read in CMake vars -file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) +file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT}) +configure_file(${TARGET_CONF_TEMP} ${TARGET_CONF} COPYONLY) ParseGetArchVars(${GETARCH2_MAKE_OUT}) # compile get_config_h @@ -145,4 +149,4 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") if (NOT ${GEN_CONFIG_H_RESULT}) MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") endif () -endif () \ No newline at end of file +endif () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6e2a98069..f20ce0533 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -234,7 +234,9 @@ function(GenerateNamedObjects sources_in) string(REPLACE ";" "\n#define " define_source "${obj_defines}") string(REPLACE "=" " " define_source "${define_source}") - file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"") + file(WRITE ${new_source_file}.tmp "#define ${define_source}\n#include \"${old_source_file}\"") + configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) + file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) endforeach () From c9c13a8a0c7b6aca3d26b2249702e711d7d1e95f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 1 Aug 2017 14:58:49 +0530 Subject: [PATCH 06/15] Fix installing cblas.h and fix tabs --- 
CMakeLists.txt | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45111ce30..e75f915c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,34 +232,33 @@ install(TARGETS ${OpenBLAS_LIBNAME} # Install include files set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX}) - execute_process(COMMAND ${GENCONFIG_BIN} - ${CMAKE_CURRENT_SOURCE_DIR}/config.h - ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h - OUTPUT_VARIABLE OPENBLAS_CONFIG_H_CONTENTS) + execute_process(COMMAND ${GENCONFIG_BIN} + ${CMAKE_CURRENT_SOURCE_DIR}/config.h + ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h + OUTPUT_VARIABLE OPENBLAS_CONFIG_H_CONTENTS) - file(WRITE ${CMAKE_BINARY_DIR}/openblas_config.tmp "${OPENBLAS_CONFIG_H_CONTENTS}") - configure_file(${CMAKE_BINARY_DIR}/openblas_config.tmp ${CMAKE_BINARY_DIR}/openblas_config.h COPYONLY) + file(WRITE ${CMAKE_BINARY_DIR}/openblas_config.tmp "${OPENBLAS_CONFIG_H_CONTENTS}") + configure_file(${CMAKE_BINARY_DIR}/openblas_config.tmp ${CMAKE_BINARY_DIR}/openblas_config.h COPYONLY) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - file(WRITE ${CMAKE_BINARY_DIR}/f77blas.tmp "") - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n") - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS) - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "${COMMON_INTERFACE_H_CONTENTS}") - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#endif") - configure_file(${CMAKE_BINARY_DIR}/f77blas.tmp ${CMAKE_BINARY_DIR}/f77blas.h COPYONLY) + file(WRITE ${CMAKE_BINARY_DIR}/f77blas.tmp "") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n") + 
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS) + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "${COMMON_INTERFACE_H_CONTENTS}") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#endif") + configure_file(${CMAKE_BINARY_DIR}/f77blas.tmp ${CMAKE_BINARY_DIR}/f77blas.h COPYONLY) install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) - string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") - configure_file(${CMAKE_BINARY_DIR}/cblas.tmp ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h COPYONLY) - install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) endif() if(NOT NO_LAPACKE) From d245caa49a6f41a90a77ed885e526838132ddec3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 1 Aug 2017 15:10:41 +0530 Subject: [PATCH 07/15] Support out-of-source build --- cmake/prebuild.cmake | 8 ++++---- ctest/CMakeLists.txt | 1 + driver/level2/CMakeLists.txt | 1 + driver/level3/CMakeLists.txt | 1 + driver/others/CMakeLists.txt | 1 + interface/CMakeLists.txt | 1 + kernel/CMakeLists.txt | 1 + lapack/CMakeLists.txt | 1 + test/CMakeLists.txt | 3 ++- utest/CMakeLists.txt | 3 ++- 10 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 422d4cb37..21dc7a380 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -51,7 +51,7 @@ else() set(TARGET_CONF "config.h") endif () 
-set(TARGET_CONF_TEMP "${TARGET_CONF}.tmp") +set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp") include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") if (NOT NOFORTRAN) @@ -85,7 +85,7 @@ configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I${PROJECT_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -113,7 +113,7 @@ configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I${PROJECT_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) @@ -129,7 +129,7 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE # append config data from getarch_2nd to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT}) -configure_file(${TARGET_CONF_TEMP} ${TARGET_CONF} COPYONLY) +configure_file(${TARGET_CONF_TEMP} ${PROJECT_BINARY_DIR}/${TARGET_CONF} COPYONLY) ParseGetArchVars(${GETARCH2_MAKE_OUT}) # compile get_config_h diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index addcffeac..73070d429 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) diff --git a/driver/level2/CMakeLists.txt 
b/driver/level2/CMakeLists.txt index f444469bd..aa5f03107 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 36677a942..c91e85f9c 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 8e0be1e0e..376cc66c4 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) if (${CORE} STREQUAL "PPC440") set(MEMORY memory_qalloc.c) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 1722dc661..8b25344c0 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) set(BLAS1_SOURCES diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8bfcccf17..0c3569259 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") # Makefile diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index afd583c11..b613c6c2b 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) set(LAPACK_SOURCES diff --git 
a/test/CMakeLists.txt b/test/CMakeLists.txt index 5e9baf928..69a1ceb91 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) @@ -35,4 +36,4 @@ add_test(NAME "${float_type}blas2" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) -endforeach() \ No newline at end of file +endforeach() diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index bd31ed9c6..e52fb2c90 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) set(OpenBLAS_utest_src utest_main.c @@ -39,4 +40,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin} ) endif() -add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) \ No newline at end of file +add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) From 8381591f5601afca6648031d7daa9406cf183091 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 1 Aug 2017 15:27:19 +0530 Subject: [PATCH 08/15] No need of a temp file for f77blas.h --- CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e75f915c2..16bce3526 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -244,12 +244,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - file(WRITE ${CMAKE_BINARY_DIR}/f77blas.tmp "") - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#ifndef OPENBLAS_F77BLAS_H\n#define 
OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n") + file(WRITE ${CMAKE_BINARY_DIR}/f77blas.h "") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#ifndef OPENBLAS_F77BLAS_H\n#define OPENBLAS_F77BLAS_H\n#include \"openblas_config.h\"\n") file(READ ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h COMMON_INTERFACE_H_CONTENTS) - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "${COMMON_INTERFACE_H_CONTENTS}") - file(APPEND ${CMAKE_BINARY_DIR}/f77blas.tmp "#endif") - configure_file(${CMAKE_BINARY_DIR}/f77blas.tmp ${CMAKE_BINARY_DIR}/f77blas.h COPYONLY) + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "${COMMON_INTERFACE_H_CONTENTS}") + file(APPEND ${CMAKE_BINARY_DIR}/f77blas.h "#endif") install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NO_CBLAS) From 89ec2be11099b84c02e7dbd22f5bb10824601bfb Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 1 Aug 2017 15:47:14 +0530 Subject: [PATCH 09/15] Fix lapacke copying --- cmake/lapacke.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f56a22886..fd5aee134 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2391,6 +2391,6 @@ foreach (Utils_FILE ${Utils_SRC}) endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") -execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h") +configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") From a6f533b2486708447e8e8b3996bed9e18d3a3ff5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Aug 2017 19:28:08 +0200 Subject: [PATCH 10/15] Revert "Fix calculated range limit exceeding actual data size for last thread" --- driver/level2/gbmv_thread.c | 1 - 
driver/level2/sbmv_thread.c | 2 -- driver/level2/tbmv_thread.c | 3 --- 3 files changed, 6 deletions(-) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index 6073a4856..e86b565f8 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -233,7 +233,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif - if (range_m[num_cpu] > n) range_m[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 68ee93ee1..5718c0ec9 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -246,7 +246,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -286,7 +285,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index aaf4958e2..226a922e9 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -288,7 +288,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -328,7 +327,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG 
lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -358,7 +356,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; From ae65f755729f633296e4d9f1bd570eb06ae9ee67 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 02:01:44 +0200 Subject: [PATCH 11/15] Travis: Simplify configuration using Build Stages and APT addon Using APT addon has nice side-effect - you don't need sudo anymore, so it can run on Travis containers-based infrastructure that is much faster than their VMs infrastructure (used when sudo is needed). You've been still running on Ubuntu Precise builders, but new default is Trusty. Thus I've explicitly set `dist: precise` to let it stay on Precise, to not change build environment by this commit. --- .travis.yml | 92 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 63b469716..878a547fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,67 @@ +# XXX: Precise is already deprecated, new default is Trusty. 
+# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming +dist: precise +sudo: false language: c +compiler: gcc + +jobs: + include: + - &test-ubuntu + stage: test + addons: + apt: + packages: + - gfortran + script: + - set -e + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 INTERFACE64=1" + + - <<: *test-ubuntu + addons: + apt: + packages: + - gcc-multilib + - gfortran-multilib + env: + - TARGET_BOX=LINUX32 + - BTYPE="BINARY=32" + + - stage: test + addons: + apt: + packages: + - binutils-mingw-w64-x86-64 + - gcc-mingw-w64-x86-64 + - gfortran-mingw-w64-x86-64 + script: + - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE + env: + - TARGET_BOX=WIN64 + - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" + +# whitelist +branches: + only: + - master + - develop notifications: webhooks: @@ -7,32 +70,3 @@ notifications: on_success: change # options: [always|never|change] default: always on_failure: always # options: [always|never|change] default: always on_start: never # options: [always|never|change] default: always - -compiler: - - gcc - -env: - - TARGET_BOX=LINUX64 BTYPE="BINARY=64" - - TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" - - TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" - - TARGET_BOX=LINUX32 BTYPE="BINARY=32" - - TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - -before_install: - - sudo apt-get update -qq - - sudo apt-get install -qq gfortran - - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 
gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi - - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi - -script: - - set -e - - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - -# whitelist -branches: - only: - - master - - develop \ No newline at end of file From e0bd5b5c0ebbe908ed88de1c21a33919d4a7d6fe Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 02:31:27 +0200 Subject: [PATCH 12/15] Travis: Build and test also on Alpine Linux (musl libc) Alpine jobs needs sudo (for chroot), so they run on VMs infrastructure. That's why they are much slower than other jobs. 
--- .travis.yml | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 878a547fa..cb0a86597 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,9 +13,10 @@ jobs: apt: packages: - gfortran + before_script: &common-before + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -51,12 +52,58 @@ jobs: - binutils-mingw-w64-x86-64 - gcc-mingw-w64-x86-64 - gfortran-mingw-w64-x86-64 + before_script: *common-before script: - - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - TARGET_BOX=WIN64 - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" + # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. + # These jobs needs sudo, so Travis runs them on VM-based infrastructure + # which is slower than container-based infrastructure used for jobs + # that don't require sudo. 
+ - &test-alpine + stage: test + dist: trusty + sudo: true + language: minimal + before_install: + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ + && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + install: + - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + before_script: *common-before + script: + - set -e + - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - alpine make -C test $COMMON_FLAGS $BTYPE + - alpine make -C ctest $COMMON_FLAGS $BTYPE + - alpine make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64" + + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 INTERFACE64=1" + + # Build with the same flags as Alpine do in OpenBLAS package. + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" + + allow_failures: + - <<: *test-alpine-openmp + # whitelist branches: only: From 08c7d1ddf8df51a8f9ec7199dfada1a419d2a101 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 14:32:17 +0200 Subject: [PATCH 13/15] Travis: Disable some gcc warnings to avoid exceeding Travis limit See: https://github.com/xianyi/OpenBLAS/pull/1255#issuecomment-318628666 --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index cb0a86597..56b3273b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,7 +77,9 @@ jobs: before_script: *common-before script: - set -e + # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. 
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - alpine make -C test $COMMON_FLAGS $BTYPE - alpine make -C ctest $COMMON_FLAGS $BTYPE - alpine make -C utest $COMMON_FLAGS $BTYPE From 486a485bb781b6a2c017c9924197911c31f7b1f4 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 18:08:44 +0200 Subject: [PATCH 14/15] Travis: Allow job LINUX64_MUSL USE_OPENMP=1 to fail See: https://github.com/xianyi/OpenBLAS/pull/1255#issuecomment-318692183 --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 56b3273b0..b1a13acd9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -87,7 +87,10 @@ jobs: - TARGET_BOX=LINUX64_MUSL - BTYPE="BINARY=64" - - <<: *test-alpine + # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, + # so it's "allowed to fail" for now (see allow_failures). + - &test-alpine-openmp + <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - BTYPE="BINARY=64 USE_OPENMP=1" From 4899d67f7db0545eb2bc820a7dcd8172b1024179 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 2 Aug 2017 11:28:45 -0700 Subject: [PATCH 15/15] THUNDERX2T99: Fix clang compilation --- kernel/arm64/casum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/copy_thunderx2t99.c | 44 ++++++++++++------------ kernel/arm64/dasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/dot_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/dznrm2_thunderx2t99.c | 44 ++++++++++++------------ kernel/arm64/iamax_thunderx2t99.c | 54 +++++++++++++++--------------- kernel/arm64/izamax_thunderx2t99.c | 52 ++++++++++++++-------------- kernel/arm64/sasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/scnrm2_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/zasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/zdot_thunderx2t99.c | 42 +++++++++++------------ 11 files changed, 244 insertions(+), 244 
deletions(-) diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index 4dac2e8ab..cd5d936c5 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -147,57 +147,57 @@ static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" - ".Lasum_kernel_F32: \n" + "2: //asum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F32 \n" + " bne 2b //asum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #31 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b 
//asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index 49526a15e..bd67b48b0 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_ " mov "Y", %[Y_] \n" " mov "INC_Y", %[INCY_] \n" " cmp "N", xzr \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lcopy_kernel_S_BEGIN \n" + " bne 4f //copy_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Lcopy_kernel_S_BEGIN \n" + " bne 4f //copy_kernel_S_BEGIN \n" - ".Lcopy_kernel_F_BEGIN: \n" + "// .Lcopy_kernel_F_BEGIN: \n" " "INIT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lcopy_kernel_F1 \n" + " beq 2f //copy_kernel_F1 \n" " .align 5 \n" - ".Lcopy_kernel_F: \n" + "1: //copy_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_F \n" + " bne 1b //copy_kernel_F \n" - ".Lcopy_kernel_F1: \n" + "2: //copy_kernel_F1: \n" #if defined(COMPLEX) && defined(DOUBLE) - " b .Lcopy_kernel_L999 \n" + " b 8f //copy_kernel_L999 \n" #else " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" #endif - ".Lcopy_kernel_F10: \n" + "3: //copy_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_F10 \n" - " b .Lcopy_kernel_L999 \n" + " bne 3b //copy_kernel_F10 \n" + " b 8f //copy_kernel_L999 \n" - ".Lcopy_kernel_S_BEGIN: \n" + "4: //copy_kernel_S_BEGIN: \n" " "INIT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lcopy_kernel_S1 \n" + " ble 6f //copy_kernel_S1 \n" - ".Lcopy_kernel_S4: \n" + "5: //copy_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_S4 \n" + " bne 5b //copy_kernel_S4 \n" - 
".Lcopy_kernel_S1: \n" + "6: //copy_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" - ".Lcopy_kernel_S10: \n" + "7: //copy_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_S10 \n" + " bne 7b //copy_kernel_S10 \n" - ".Lcopy_kernel_L999: \n" + "8: //copy_kernel_L999: \n" : : [N_] "r" (n), //%1 diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index bd6bb055d..ba12fc776 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F32: \n" + "2: //asum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F32 \n" + " bne 2b //asum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #31 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " 
"KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 6d54fd805..8eeb94f36 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" - ".Ldot_kernel_F_BEGIN: \n" + "1: //dot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Ldot_kernel_F1 \n" + " beq 3f //dot_kernel_F1 \n" " .align 5 \n" - ".Ldot_kernel_F: \n" + "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F \n" + " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Ldot_kernel_F1: \n" + "3: //dot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_F10: \n" + "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F10 \n" - " b .Ldot_kernel_L999 \n" + " bne 4b //dot_kernel_F10 \n" + " b 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S_BEGIN: \n" + "5: //dot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", 
"INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Ldot_kernel_S1 \n" + " ble 7f //dot_kernel_S1 \n" - ".Ldot_kernel_S4: \n" + "6: //dot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S4 \n" + " bne 6b //dot_kernel_S4 \n" - ".Ldot_kernel_S1: \n" + "7: //dot_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S10: \n" + "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S10 \n" + " bne 8b //dot_kernel_S10 \n" - ".Ldot_kernel_L999: \n" + "9: //dot_kernel_L999: \n" " str "DOTF", [%[DOT_]] \n" : diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index a6613d7a5..2aea9b4a9 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmov "SCALE", xzr \n" " fmov "SSQ", #1.0 \n" " cmp "N", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_BEGIN: \n" + "1: //nrm2_kernel_F_BEGIN: \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_ZERO_SKIP: \n" + "2: //nrm2_kernel_F_ZERO_SKIP: \n" " ldr d4, ["X"] \n" " fcmp d4, "REGZERO" \n" - " bne .Lnrm2_kernel_F_INIT \n" + " bne 3f //nrm2_kernel_F_INIT \n" #if defined(COMPLEX) " ldr d4, ["X", #8] \n" " fcmp d4, "REGZERO" \n" - " bne .Lnrm2_kernel_F_INIT_I \n" + " bne 4f //nrm2_kernel_F_INIT_I \n" #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " beq .Lnrm2_kernel_L999 \n" - " b .Lnrm2_kernel_F_ZERO_SKIP \n" + " beq 9f //nrm2_kernel_L999 \n" + " b 
2b //nrm2_kernel_F_ZERO_SKIP \n" - ".Lnrm2_kernel_F_INIT: \n" + "3: //nrm2_kernel_F_INIT: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" @@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fadd "SSQ", "SSQ", d4 \n" " fmov "SCALE", "CUR_MAX" \n" #if defined(COMPLEX) - ".Lnrm2_kernel_F_INIT_I: \n" + "4: //nrm2_kernel_F_INIT_I: \n" " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" @@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_START: \n" + "5: //nrm2_kernel_F_START: \n" " cmp "INC_X", #"SZ" \n" - " bne .Lnrm2_kernel_F1 \n" + " bne 8f //nrm2_kernel_F1 \n" " asr "K", "J", #4 \n" " cmp "K", xzr \n" - " beq .Lnrm2_kernel_F1 \n" + " beq 8f //nrm2_kernel_F1 \n" - ".Lnrm2_kernel_F: \n" + "6: //nrm2_kernel_F: \n" " ldp q16, q17, ["X"] \n" " ldp q18, q19, ["X", #32] \n" " ldp q20, q21, ["X", #64] \n" @@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmov "SCALE", "CUR_MAX" \n" #endif " subs "K", "K", #1 \n" - " bne .Lnrm2_kernel_F \n" + " bne 6b //nrm2_kernel_F \n" - ".Lnrm2_kernel_F_DONE: \n" + "7: //nrm2_kernel_F_DONE: \n" " ands "J", "J", #15 \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F1: \n" + "8: //nrm2_kernel_F1: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" @@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F1 \n" + " bne 8b //nrm2_kernel_F1 \n" - ".Lnrm2_kernel_L999: \n" + "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index bc5f3c3ca..a11b18419 100644 --- 
a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #endif -static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG index = 0; @@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" - " ble .Liamax_kernel_zero \n" + " ble 10f //iamax_kernel_zero \n" " cmp "INC_X", xzr \n" - " ble .Liamax_kernel_zero \n" + " ble 10f //iamax_kernel_zero \n" " cmp "INC_X", #1 \n" - " bne .Liamax_kernel_S_BEGIN \n" + " bne 5f //iamax_kernel_S_BEGIN \n" " mov x7, "X" \n" - ".Liamax_kernel_F_BEGIN: \n" + "1: //iamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Liamax_kernel_F1 \n" + " beq 3f //iamax_kernel_F1 \n" " add "Z", "Z", #1 \n" - ".Liamax_kernel_F: \n" + "2: //iamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_F \n" + " bne 2b //iamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" - ".Liamax_kernel_F1: \n" + "3: //iamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_F10: \n" + "4: //iamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_F10 \n" - " b .Liamax_kernel_L999 \n" + " bne 4b //iamax_kernel_F10 \n" + " b 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_S_BEGIN: \n" + "5: //iamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Liamax_kernel_S1 \n" + " ble 7f //iamax_kernel_S1 \n" - ".Liamax_kernel_S4: \n" + "6: 
//iamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_S4 \n" + " bne 6b //iamax_kernel_S4 \n" - ".Liamax_kernel_S1: \n" + "7: //iamax_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_S10: \n" + "8: //iamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_S10 \n" + " bne 8b //iamax_kernel_S10 \n" - ".Liamax_kernel_L999: \n" + "9: //iamax_kernel_L999: \n" " mov x0, "INDEX" \n" - " b .Liamax_kernel_DONE \n" + " b 11f //iamax_kernel_DONE \n" - ".Liamax_kernel_zero: \n" + "10: //iamax_kernel_zero: \n" " mov x0, xzr \n" - ".Liamax_kernel_DONE: \n" + "11: //iamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 152f936b6..8d70b0515 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" - " ble .Lizamax_kernel_zero \n" + " ble 10f //izamax_kernel_zero \n" " cmp "INC_X", xzr \n" - " ble .Lizamax_kernel_zero \n" + " ble 10f //izamax_kernel_zero \n" " cmp "INC_X", #1 \n" - " bne .Lizamax_kernel_S_BEGIN \n" + " bne 5f //izamax_kernel_S_BEGIN \n" " mov x7, "X" \n" - ".Lizamax_kernel_F_BEGIN: \n" + "1: //izamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lizamax_kernel_F1 \n" + " beq 3f //izamax_kernel_F1 \n" " add "Z", "Z", #1 \n" - ".Lizamax_kernel_F: \n" + "2: //izamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_F \n" + " bne 2b //izamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" - ".Lizamax_kernel_F1: \n" 
+ "3: //izamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_F10: \n" + "4: //izamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_F10 \n" - " b .Lizamax_kernel_L999 \n" + " bne 4b //izamax_kernel_F10 \n" + " b 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_S_BEGIN: \n" + "5: //izamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lizamax_kernel_S1 \n" + " ble 7f //izamax_kernel_S1 \n" - ".Lizamax_kernel_S4: \n" + "6: //izamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_S4 \n" + " bne 6b //izamax_kernel_S4 \n" - ".Lizamax_kernel_S1: \n" + "7: //izamax_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_S10: \n" + "8: //izamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_S10 \n" + " bne 8b //izamax_kernel_S10 \n" - ".Lizamax_kernel_L999: \n" + "9: //izamax_kernel_L999: \n" " mov x0, "INDEX" \n" - " b .Lizamax_kernel_DONE \n" + " b 11f //izamax_kernel_DONE \n" - ".Lizamax_kernel_zero: \n" + "10: //izamax_kernel_zero: \n" " mov x0, xzr \n" - ".Lizamax_kernel_DONE: \n" + "11: //izamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 767535dae..28fc34c62 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + 
" ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #6 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F64: \n" + "2: //asum_kernel_F64: \n" " "KERNEL_F64" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F64 \n" + " bne 2b //asum_kernel_F64 \n" " "KERNEL_F64_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #63 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index c745dcc03..b8df4962b 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp 
"N", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lnrm2_kernel_S_BEGIN \n" + " bne 5f //nrm2_kernel_S_BEGIN \n" - ".Lnrm2_kernel_F_BEGIN: \n" + "1: //nrm2_kernel_F_BEGIN: \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lnrm2_kernel_S_BEGIN \n" + " beq 5f //nrm2_kernel_S_BEGIN \n" " .align 5 \n" - ".Lnrm2_kernel_F: \n" + "2: //nrm2_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F \n" + " bne 2b //nrm2_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Lnrm2_kernel_F1: \n" + "3: //nrm2_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F10: \n" + "4: //nrm2_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F10 \n" - " b .Lnrm2_kernel_L999 \n" + " bne 4b //nrm2_kernel_F10 \n" + " b 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_S_BEGIN: \n" + "5: //nrm2_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lnrm2_kernel_S1 \n" + " ble 7f //nrm2_kernel_S1 \n" - ".Lnrm2_kernel_S4: \n" + "6: //nrm2_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_S4 \n" + " bne 6b //nrm2_kernel_S4 \n" - ".Lnrm2_kernel_S1: \n" + "7: //nrm2_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_S10: \n" + "8: //nrm2_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_S10 \n" + " bne 8b //nrm2_kernel_S10 \n" - ".Lnrm2_kernel_L999: \n" + "9: //nrm2_kernel_L999: \n" " "KERNEL_FINALIZE" \n" " fmov %[RET_], "SSQD" \n" diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index e0f4ae21a..140e5a741 100644 --- a/kernel/arm64/zasum_thunderx2t99.c 
+++ b/kernel/arm64/zasum_thunderx2t99.c @@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #4 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F16: \n" + "2: //asum_kernel_F16: \n" " "KERNEL_F16" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F16 \n" + " bne 2b //asum_kernel_F16 \n" " "KERNEL_F16_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #15 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 diff --git 
a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 64823871f..70d683077 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" - ".Ldot_kernel_F_BEGIN: \n" + "1: //dot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Ldot_kernel_F1 \n" + " beq 3f //dot_kernel_F1 \n" " .align 5 \n" - ".Ldot_kernel_F: \n" + "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F \n" + " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Ldot_kernel_F1: \n" + "3: //dot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_F10: \n" + "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F10 \n" - " b .Ldot_kernel_L999 \n" + " bne 4b //dot_kernel_F10 \n" + " b 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S_BEGIN: \n" + "5: //dot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Ldot_kernel_S1 \n" + " ble 7f //dot_kernel_S1 \n" - ".Ldot_kernel_S4: \n" + "6: //dot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S4 \n" + " bne 6b //dot_kernel_S4 \n" - ".Ldot_kernel_S1: \n" + "7: //dot_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S10: 
\n" + "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S10 \n" + " bne 8b //dot_kernel_S10 \n" - ".Ldot_kernel_L999: \n" + "9: //dot_kernel_L999: \n" " str "DOTF", [%[DOTR_]] \n" " str "DOTI", [%[DOTI_]] \n"