From 9332042d5f6a630d00c868781a0eb3e660517bd7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:13:24 +0200 Subject: [PATCH 01/30] Fix range exceeding actual data size in quick_divide --- driver/level2/gbmv_thread.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index e86b565f8..6073a4856 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -233,6 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif + if (range_m[num_cpu] > n) range_m[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; From 857f61bc5dea502d07946a8637e70944b277ee2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:21:53 +0200 Subject: [PATCH 02/30] Fix range limit exceeding data size in last step --- driver/level2/sbmv_thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 5718c0ec9..68ee93ee1 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; From 585c0010a5de7b42ab32ddb8230b4bc20eeedd43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Jul 2017 00:27:02 +0200 Subject: [PATCH 03/30] 
Fix range limit exceeding actual data size in last step --- driver/level2/tbmv_thread.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index 226a922e9..aaf4958e2 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; From dc249144157df2e2a877c89bc7799712ab172819 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 28 Jul 2017 11:49:39 +0530 Subject: [PATCH 04/30] check compiler is msvc instead of msvc --- cmake/prebuild.cmake | 11 +++++------ kernel/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a7f98bfb8..eacf518cd 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -63,15 +63,14 @@ set(GETARCH_SRC ${CPUIDEMO} ) -if (NOT MSVC) +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") + #Use generic for MSVC now + message("MSVC") + set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) +else() list(APPEND GETARCH_SRC 
${PROJECT_SOURCE_DIR}/cpuid.S) endif () -if (MSVC) -#Use generic for MSVC now -set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) -endif() - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") # disable WindowsStore strict CRT checks set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8bfcccf17..2a5cb0d8f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -22,7 +22,7 @@ ParseMakefileVars("${KERNELDIR}/KERNEL") ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") if (${ARCH} STREQUAL "x86") -if (NOT MSVC) +if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) else() GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) From ca17b4b75cc4c897b7c7e8dca51c9e465ed46ef8 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 28 Jul 2017 11:50:29 +0530 Subject: [PATCH 05/30] Fix complex support for MSVC headers --- common.h | 50 +++++++++++++++++++------------------- common_param.h | 12 ++++----- kernel/x86_64/cdot.c | 9 +++---- kernel/x86_64/zdot.c | 9 +++---- openblas_config_template.h | 2 +- 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/common.h b/common.h index 4463141c8..22ec383ac 100644 --- a/common.h +++ b/common.h @@ -495,6 +495,31 @@ static void __inline blas_lock(volatile BLASULONG *address){ #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) #endif +/* C99 supports complex floating numbers natively, which GCC also offers as an + extension since version 3.0. If neither are available, use a compatible + structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ +#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) + #define OPENBLAS_COMPLEX_C99 + #ifndef __cplusplus + #include + #endif + typedef float _Complex openblas_complex_float; + typedef double _Complex openblas_complex_double; + typedef xdouble _Complex openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) +#else + #define OPENBLAS_COMPLEX_STRUCT + typedef struct { float real, imag; } openblas_complex_float; + typedef struct { double real, imag; } openblas_complex_double; + typedef struct { xdouble real, imag; } openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} +#endif + #include "param.h" #include "common_param.h" @@ -524,31 +549,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ #include #endif // NOINCLUDE -/* C99 supports complex floating numbers natively, which GCC also offers as an - extension since version 3.0. If neither are available, use a compatible - structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ -#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) - #define OPENBLAS_COMPLEX_C99 - #ifndef __cplusplus - #include - #endif - typedef float _Complex openblas_complex_float; - typedef double _Complex openblas_complex_double; - typedef xdouble _Complex openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) -#else - #define OPENBLAS_COMPLEX_STRUCT - typedef struct { float real, imag; } openblas_complex_float; - typedef struct { double real, imag; } openblas_complex_double; - typedef struct { xdouble real, imag; } openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) {(real), (imag)} - #define openblas_make_complex_double(real, imag) {(real), (imag)} - #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} -#endif - #ifdef XDOUBLE #define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) diff --git a/common_param.h b/common_param.h index 36d6149ea..0513ace9f 100644 --- a/common_param.h +++ b/common_param.h @@ -333,8 +333,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, 
float); int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -496,8 +496,8 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -661,8 +661,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index ce396a2ce..5f01f7eeb 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -91,16 +91,15 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -FLOAT _Complex CNAME(BLASLONG n, 
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; BLASLONG ix,iy; - FLOAT _Complex result; FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; if ( n <= 0 ) { - result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; + OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; return(result); } @@ -160,11 +159,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) - result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; + OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; // CREAL(result) = dot[0] - dot[1]; // CIMAG(result) = dot[4] + dot[5]; #else - result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; + OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; // CREAL(result) = dot[0] + dot[1]; // CIMAG(result) = dot[4] - dot[5]; diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 2fcacc87a..d11c76647 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -86,18 +86,17 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; BLASLONG ix,iy; - FLOAT _Complex result; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) { // CREAL(result) = 0.0 ; // CIMAG(result) = 0.0 ; - result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); + OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -151,11 +150,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) - result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); + OPENBLAS_COMPLEX_FLOAT 
result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); // CREAL(result) = dot[0] - dot[1]; // CIMAG(result) = dot[2] + dot[3]; #else - result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); + OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); // CREAL(result) = dot[0] + dot[1]; // CIMAG(result) = dot[2] - dot[3]; diff --git a/openblas_config_template.h b/openblas_config_template.h index fd6171492..52dd49da2 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -59,7 +59,7 @@ typedef int blasint; extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) + (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include From eb98fdddfc4ef33dc7fe9817da4e6fe0c369db38 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 20:38:16 +0530 Subject: [PATCH 06/30] typedefs only for c --- common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common.h b/common.h index 22ec383ac..ae98279ef 100644 --- a/common.h +++ b/common.h @@ -495,6 +495,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) #endif +#ifndef ASSEMBLER /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ @@ -519,6 +520,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #define openblas_make_complex_double(real, imag) {(real), (imag)} #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} #endif +#endif #include "param.h" #include "common_param.h" From 4c5df489db4792acfc861135b8d90e03de9805a6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 20:59:17 +0530 Subject: [PATCH 07/30] clang on windows needs FU='' --- cmake/c_check.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 56ae612ea..a0784f09c 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -28,6 +28,8 @@ set(FU "") if(APPLE) set(FU "_") +elseif(MSVC AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang") +set(FU "") elseif(MSVC) set(FU "_") elseif(UNIX) From ea1095135ebf60350cd8a60f35c299c1b100a224 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:00:32 +0530 Subject: [PATCH 08/30] Ninja complains that file openblas.def does not exist --- cmake/export.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/export.cmake b/cmake/export.cmake index 629f8fbc2..a9d1fc458 100644 --- a/cmake/export.cmake +++ b/cmake/export.cmake @@ -51,7 +51,8 @@ else() endif() add_custom_command( - TARGET ${OpenBLAS_LIBNAME} PRE_LINK + OUTPUT ${PROJECT_BINARY_DIR}/openblas.def + #TARGET ${OpenBLAS_LIBNAME} PRE_LINK COMMAND perl ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" COMMENT "Create openblas.def file" From b03d50b7944259e38af324bed466d842618348af Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:16:00 +0530 Subject: [PATCH 09/30] Test clang in appveyor.yml --- appveyor.yml | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 
deletions(-) diff --git a/appveyor.yml b/appveyor.yml index c9d8e47ac..837e81292 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -12,9 +12,6 @@ clone_folder: c:\projects\OpenBLAS init: - git config --global core.autocrlf input -build: - project: OpenBLAS.sln - clone_depth: 5 #branches to build @@ -27,16 +24,30 @@ branches: skip_tags: true matrix: - fast_finish: true + fast_finish: false skip_commits: # Add [av skip] to commit messages message: /\[av skip\]/ +environment: + matrix: + - COMPILER: cl + - COMPILER: clang-cl + +install: + - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force + - if [%COMPILER%]==[clang-cl] conda install clangdev ninja cmake + before_build: - echo Running cmake... - cd c:\projects\OpenBLAS - - cmake -G "Visual Studio 12 Win64" . + - if [%COMPILER%]==[cl] cmake -G "Visual Studio 12 Win64" . + - if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER_ID=clang-cl -DCMAKE_C_COMPILER_ID=clang-cl . + +build: + - cmake --build . test_script: - echo Running Test From c56d4881f9e14a0a7a25f9c70f5bfd5732ab5752 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:37:48 +0530 Subject: [PATCH 10/30] Fix appveyor.yml --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 837e81292..8394d3bac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -46,7 +46,7 @@ before_build: - if [%COMPILER%]==[cl] cmake -G "Visual Studio 12 Win64" . - if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER_ID=clang-cl -DCMAKE_C_COMPILER_ID=clang-cl . -build: +build_script: - cmake --build . 
test_script: From ff17e3eb9f4e79e661c5bad2180e2d4c65ecfa2d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:47:15 +0530 Subject: [PATCH 11/30] build clang-cl first --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 8394d3bac..5022d179e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,8 +32,8 @@ skip_commits: environment: matrix: - - COMPILER: cl - COMPILER: clang-cl + - COMPILER: cl install: - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat From 7a4ebf825bae6f994f064c37805356a01d62e87f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:48:49 +0530 Subject: [PATCH 12/30] add --yes to conda in appveyor.yml --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 5022d179e..d6f175763 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -38,7 +38,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install clangdev ninja cmake + - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake before_build: - echo Running cmake... From 1169f489a4246da9492322c37680f4b0672fdf64 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:54:32 +0530 Subject: [PATCH 13/30] Fix CMAKE_C_COMPILER in appveyor --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index d6f175763..7a1a17a7a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -44,7 +44,7 @@ before_build: - echo Running cmake... - cd c:\projects\OpenBLAS - if [%COMPILER%]==[cl] cmake -G "Visual Studio 12 Win64" . - - if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER_ID=clang-cl -DCMAKE_C_COMPILER_ID=clang-cl . 
+ - if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl . build_script: - cmake --build . From a36e9764918d635f36ee51e7472174fbf6219115 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 21:58:53 +0530 Subject: [PATCH 14/30] vsvarsall in appveyor --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index 7a1a17a7a..d8b9a04b0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,6 +39,7 @@ install: - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake + - if [%COMPILER%]==[clang-cl] "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64 before_build: - echo Running cmake... From 7345795e64b4875cc187821d62cd230b1a0ad23b Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 22:16:53 +0530 Subject: [PATCH 15/30] Try adding RC to path --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index d8b9a04b0..1aa8e391d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -40,6 +40,7 @@ install: - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake - if [%COMPILER%]==[clang-cl] "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64 + - if [%COMPILER%]==[clang-cl] set "PATH=%PATH%;C:\Program Files (x86)\Windows Kits\10\bin\x64" before_build: - echo Running cmake... 
From 5e0f67c666a48243b02dfe9c58e31c70c5d1ae6f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 23:30:15 +0530 Subject: [PATCH 16/30] Make ARCH variable a CACHE variable --- cmake/c_check.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index a0784f09c..776850e11 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -61,7 +61,8 @@ endif () # CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. # # TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check -set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) +set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") + if (${ARCH} STREQUAL "AMD64") set(ARCH "x86_64") endif () From 02c1f860551f18f2efaf874e59fd7be4ecaa4d38 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 23:42:38 +0530 Subject: [PATCH 17/30] Fix copying libopenblas.dll --- utest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index bd31ed9c6..46c46c5ff 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -35,7 +35,7 @@ endforeach() if (MSVC) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/$/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. 
) endif() From 1a02a087a18297fc5f1bffd3d0911b9cf8ffa790 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2017 23:42:56 +0530 Subject: [PATCH 18/30] Fix vcvarsall call in appveyor --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 1aa8e391d..5ae6c44c0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,7 +39,7 @@ install: - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake - - if [%COMPILER%]==[clang-cl] "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64 + - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 - if [%COMPILER%]==[clang-cl] set "PATH=%PATH%;C:\Program Files (x86)\Windows Kits\10\bin\x64" before_build: From f00bbb9dbf6df423c183bc63aa078e501a5281e3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 30 Jul 2017 00:00:37 +0530 Subject: [PATCH 19/30] Remove unnecessary line in appveyor --- appveyor.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 5ae6c44c0..cebd3aec6 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -40,7 +40,6 @@ install: - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 - - if [%COMPILER%]==[clang-cl] set "PATH=%PATH%;C:\Program Files (x86)\Windows Kits\10\bin\x64" before_build: - echo Running cmake... 
From 63cfa32691680505e6b9daf0997755178ddd3144 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Jul 2017 21:02:43 +0200 Subject: [PATCH 20/30] Rework __GLIBC_PREREQ checks to avoid breaking non-glibc builds --- driver/others/memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index b5b58b6fd..661f7c4eb 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; #endif - extern void openblas_warning(int verbose, const char * msg); #ifndef SMP @@ -187,25 +186,24 @@ int i,n; #if !defined(__GLIBC_PREREQ) return nums; -#endif -#if !__GLIBC_PREREQ(2, 3) +#else + #if !__GLIBC_PREREQ(2, 3) return nums; -#endif + #endif -#if !__GLIBC_PREREQ(2, 7) + #if !__GLIBC_PREREQ(2, 7) ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) return nums; n=0; -#if !__GLIBC_PREREQ(2, 6) + #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Tue, 1 Aug 2017 19:28:08 +0200 Subject: [PATCH 21/30] Revert "Fix calculated range limit exceeding actual data size for last thread" --- driver/level2/gbmv_thread.c | 1 - driver/level2/sbmv_thread.c | 2 -- driver/level2/tbmv_thread.c | 3 --- 3 files changed, 6 deletions(-) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index 6073a4856..e86b565f8 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -233,7 +233,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif - if (range_m[num_cpu] > n) range_m[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 68ee93ee1..5718c0ec9 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -246,7 +246,6 @@ 
int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -286,7 +285,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index aaf4958e2..226a922e9 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -288,7 +288,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -328,7 +327,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -358,7 +356,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; From ae65f755729f633296e4d9f1bd570eb06ae9ee67 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 02:01:44 +0200 Subject: [PATCH 22/30] Travis: Simplify 
configuration using Build Stages and APT addon Using APT addon has a nice side-effect - you don't need sudo anymore, so it can run on Travis containers-based infrastructure that is much faster than their VMs infrastructure (used when sudo is needed). You've been still running on Ubuntu Precise builders, but new default is Trusty. Thus I've explicitly set `dist: precise` to let it stay on Precise, to not change build environment by this commit. --- .travis.yml | 92 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 63b469716..878a547fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,67 @@ +# XXX: Precise is already deprecated, new default is Trusty. +# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming +dist: precise +sudo: false language: c +compiler: gcc + +jobs: + include: + - &test-ubuntu + stage: test + addons: + apt: + packages: + - gfortran + script: + - set -e + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 INTERFACE64=1" + + - <<: *test-ubuntu + addons: + apt: + packages: + - gcc-multilib + - gfortran-multilib + env: + - TARGET_BOX=LINUX32 + - BTYPE="BINARY=32" + + - stage: test + addons: + apt: + packages: + - binutils-mingw-w64-x86-64 + - gcc-mingw-w64-x86-64 + - gfortran-mingw-w64-x86-64 + script: + - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE + env: + - TARGET_BOX=WIN64 + - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" + +# whitelist +branches: + only: + - master + - develop
notifications: webhooks: @@ -7,32 +70,3 @@ notifications: on_success: change # options: [always|never|change] default: always on_failure: always # options: [always|never|change] default: always on_start: never # options: [always|never|change] default: always - -compiler: - - gcc - -env: - - TARGET_BOX=LINUX64 BTYPE="BINARY=64" - - TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" - - TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" - - TARGET_BOX=LINUX32 BTYPE="BINARY=32" - - TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - -before_install: - - sudo apt-get update -qq - - sudo apt-get install -qq gfortran - - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi - - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi - -script: - - set -e - - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - -# whitelist -branches: - only: - - master - - develop \ No newline at end of file From e0bd5b5c0ebbe908ed88de1c21a33919d4a7d6fe Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 02:31:27 +0200 Subject: [PATCH 23/30] Travis: Build and test also on Alpine Linux (musl libc) Alpine jobs need sudo (for chroot), so they run on VM-based infrastructure. That's why they are much slower than other jobs.
--- .travis.yml | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 878a547fa..cb0a86597 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,9 +13,10 @@ jobs: apt: packages: - gfortran + before_script: &common-before + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -51,12 +52,58 @@ jobs: - binutils-mingw-w64-x86-64 - gcc-mingw-w64-x86-64 - gfortran-mingw-w64-x86-64 + before_script: *common-before script: - - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - TARGET_BOX=WIN64 - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" + # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. + # These jobs needs sudo, so Travis runs them on VM-based infrastructure + # which is slower than container-based infrastructure used for jobs + # that don't require sudo. 
+ - &test-alpine + stage: test + dist: trusty + sudo: true + language: minimal + before_install: + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ + && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + install: + - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + before_script: *common-before + script: + - set -e + - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - alpine make -C test $COMMON_FLAGS $BTYPE + - alpine make -C ctest $COMMON_FLAGS $BTYPE + - alpine make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64" + + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 INTERFACE64=1" + + # Build with the same flags as Alpine do in OpenBLAS package. + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" + + allow_failures: + - <<: *test-alpine-openmp + # whitelist branches: only: From 08c7d1ddf8df51a8f9ec7199dfada1a419d2a101 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 14:32:17 +0200 Subject: [PATCH 24/30] Travis: Disable some gcc warnings to avoid exceeding Travis limit See: https://github.com/xianyi/OpenBLAS/pull/1255#issuecomment-318628666 --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index cb0a86597..56b3273b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,7 +77,9 @@ jobs: before_script: *common-before script: - set -e + # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. 
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - alpine make -C test $COMMON_FLAGS $BTYPE - alpine make -C ctest $COMMON_FLAGS $BTYPE - alpine make -C utest $COMMON_FLAGS $BTYPE From 486a485bb781b6a2c017c9924197911c31f7b1f4 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Fri, 28 Jul 2017 18:08:44 +0200 Subject: [PATCH 25/30] Travis: Allow job LINUX64_MUSL USE_OPENMP=1 to fail See: https://github.com/xianyi/OpenBLAS/pull/1255#issuecomment-318692183 --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 56b3273b0..b1a13acd9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -87,7 +87,10 @@ jobs: - TARGET_BOX=LINUX64_MUSL - BTYPE="BINARY=64" - - <<: *test-alpine + # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, + # so it's "allowed to fail" for now (see allow_failures). + - &test-alpine-openmp + <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - BTYPE="BINARY=64 USE_OPENMP=1" From c4e5ba1bfe8c7c4e263d5c14f4034e657347b591 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Aug 2017 00:37:58 +0200 Subject: [PATCH 26/30] Make sure that range_n of last thread never exceeds the actual data size when splitting the workload --- driver/level2/gbmv_thread.c | 2 ++ driver/level2/sbmv_thread.c | 3 +++ driver/level2/spmv_thread.c | 2 ++ driver/level2/symv_thread.c | 4 +++- driver/level2/tbmv_thread.c | 3 +++ driver/level2/tpmv_thread.c | 4 +++- driver/level2/trmv_thread.c | 4 +++- 7 files changed, 19 insertions(+), 3 deletions(-) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index e86b565f8..9d374676e 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #ifndef TRANSA range_m[num_cpu] = num_cpu * ((m + 15) & ~15); + if (range_m[num_cpu] > m) range_m[num_cpu] = 
m; #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); + if (range_m[num_cpu] > n) range_m[num_cpu] = n; #endif queue[num_cpu].mode = mode; diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 5718c0ec9..ce841ee0e 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; @@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * ((n + 15) & ~15); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 035300841..0b4087430 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; @@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = 
num_cpu * (((m + 15) & ~15) + 16); + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index 6580178f1..8d4cd249c 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + if (range_n[num_cpu] > m) range_n[num_cpu] = m; + queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; @@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = symv_kernel; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index 226a922e9..aaf4958e2 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, 
BLASLONG inc range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + if (range_n[num_cpu] > n) range_n[num_cpu] = n; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index c91b52775..79438ba29 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + if (range_n[num_cpu] > m) range_n[num_cpu] = m; + queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; @@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 0a155366c..8b931a0e8 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,7 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + if (range_n[num_cpu] > m) range_n[num_cpu] = m; + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; From 
0ba64cee60c90f2533b918bc026283f5d5288a89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Aug 2017 12:03:54 +0200 Subject: [PATCH 27/30] Update trmv_thread.c --- driver/level2/trmv_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 8b931a0e8..24b881a93 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -347,7 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From 4899d67f7db0545eb2bc820a7dcd8172b1024179 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 2 Aug 2017 11:28:45 -0700 Subject: [PATCH 28/30] THUNDERX2T99: Fix clang compilation --- kernel/arm64/casum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/copy_thunderx2t99.c | 44 ++++++++++++------------ kernel/arm64/dasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/dot_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/dznrm2_thunderx2t99.c | 44 ++++++++++++------------ kernel/arm64/iamax_thunderx2t99.c | 54 +++++++++++++++--------------- kernel/arm64/izamax_thunderx2t99.c | 52 ++++++++++++++-------------- kernel/arm64/sasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/scnrm2_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/zasum_thunderx2t99.c | 42 +++++++++++------------ kernel/arm64/zdot_thunderx2t99.c | 42 +++++++++++------------ 11 files changed, 244 insertions(+), 244 deletions(-) diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index 4dac2e8ab..cd5d936c5 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -147,57 +147,57 @@ static FLOAT
casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" - ".Lasum_kernel_F32: \n" + "2: //asum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F32 \n" + " bne 2b //asum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #31 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index 49526a15e..bd67b48b0 
100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_ " mov "Y", %[Y_] \n" " mov "INC_Y", %[INCY_] \n" " cmp "N", xzr \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lcopy_kernel_S_BEGIN \n" + " bne 4f //copy_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Lcopy_kernel_S_BEGIN \n" + " bne 4f //copy_kernel_S_BEGIN \n" - ".Lcopy_kernel_F_BEGIN: \n" + "// .Lcopy_kernel_F_BEGIN: \n" " "INIT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lcopy_kernel_F1 \n" + " beq 2f //copy_kernel_F1 \n" " .align 5 \n" - ".Lcopy_kernel_F: \n" + "1: //copy_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_F \n" + " bne 1b //copy_kernel_F \n" - ".Lcopy_kernel_F1: \n" + "2: //copy_kernel_F1: \n" #if defined(COMPLEX) && defined(DOUBLE) - " b .Lcopy_kernel_L999 \n" + " b 8f //copy_kernel_L999 \n" #else " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" #endif - ".Lcopy_kernel_F10: \n" + "3: //copy_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_F10 \n" - " b .Lcopy_kernel_L999 \n" + " bne 3b //copy_kernel_F10 \n" + " b 8f //copy_kernel_L999 \n" - ".Lcopy_kernel_S_BEGIN: \n" + "4: //copy_kernel_S_BEGIN: \n" " "INIT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lcopy_kernel_S1 \n" + " ble 6f //copy_kernel_S1 \n" - ".Lcopy_kernel_S4: \n" + "5: //copy_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lcopy_kernel_S4 \n" + " bne 5b //copy_kernel_S4 \n" - ".Lcopy_kernel_S1: \n" + "6: //copy_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lcopy_kernel_L999 \n" + " ble 8f //copy_kernel_L999 \n" - ".Lcopy_kernel_S10: \n" + "7: //copy_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne 
.Lcopy_kernel_S10 \n" + " bne 7b //copy_kernel_S10 \n" - ".Lcopy_kernel_L999: \n" + "8: //copy_kernel_L999: \n" : : [N_] "r" (n), //%1 diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index bd6bb055d..ba12fc776 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F32: \n" + "2: //asum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F32 \n" + " bne 2b //asum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #31 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - 
".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 6d54fd805..8eeb94f36 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" - ".Ldot_kernel_F_BEGIN: \n" + "1: //dot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Ldot_kernel_F1 \n" + " beq 3f //dot_kernel_F1 \n" " .align 5 \n" - ".Ldot_kernel_F: \n" + "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F \n" + " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Ldot_kernel_F1: \n" + "3: //dot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_F10: \n" + "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F10 \n" - " b .Ldot_kernel_L999 \n" + " bne 4b //dot_kernel_F10 \n" + " b 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S_BEGIN: \n" + "5: //dot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Ldot_kernel_S1 \n" + " ble 7f //dot_kernel_S1 \n" - ".Ldot_kernel_S4: \n" + "6: //dot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" 
" "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S4 \n" + " bne 6b //dot_kernel_S4 \n" - ".Ldot_kernel_S1: \n" + "7: //dot_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S10: \n" + "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S10 \n" + " bne 8b //dot_kernel_S10 \n" - ".Ldot_kernel_L999: \n" + "9: //dot_kernel_L999: \n" " str "DOTF", [%[DOT_]] \n" : diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index a6613d7a5..2aea9b4a9 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmov "SCALE", xzr \n" " fmov "SSQ", #1.0 \n" " cmp "N", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_BEGIN: \n" + "1: //nrm2_kernel_F_BEGIN: \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_ZERO_SKIP: \n" + "2: //nrm2_kernel_F_ZERO_SKIP: \n" " ldr d4, ["X"] \n" " fcmp d4, "REGZERO" \n" - " bne .Lnrm2_kernel_F_INIT \n" + " bne 3f //nrm2_kernel_F_INIT \n" #if defined(COMPLEX) " ldr d4, ["X", #8] \n" " fcmp d4, "REGZERO" \n" - " bne .Lnrm2_kernel_F_INIT_I \n" + " bne 4f //nrm2_kernel_F_INIT_I \n" #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " beq .Lnrm2_kernel_L999 \n" - " b .Lnrm2_kernel_F_ZERO_SKIP \n" + " beq 9f //nrm2_kernel_L999 \n" + " b 2b //nrm2_kernel_F_ZERO_SKIP \n" - ".Lnrm2_kernel_F_INIT: \n" + "3: //nrm2_kernel_F_INIT: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" @@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG 
inc_x, " fadd "SSQ", "SSQ", d4 \n" " fmov "SCALE", "CUR_MAX" \n" #if defined(COMPLEX) - ".Lnrm2_kernel_F_INIT_I: \n" + "4: //nrm2_kernel_F_INIT_I: \n" " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" @@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F_START: \n" + "5: //nrm2_kernel_F_START: \n" " cmp "INC_X", #"SZ" \n" - " bne .Lnrm2_kernel_F1 \n" + " bne 8f //nrm2_kernel_F1 \n" " asr "K", "J", #4 \n" " cmp "K", xzr \n" - " beq .Lnrm2_kernel_F1 \n" + " beq 8f //nrm2_kernel_F1 \n" - ".Lnrm2_kernel_F: \n" + "6: //nrm2_kernel_F: \n" " ldp q16, q17, ["X"] \n" " ldp q18, q19, ["X", #32] \n" " ldp q20, q21, ["X", #64] \n" @@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmov "SCALE", "CUR_MAX" \n" #endif " subs "K", "K", #1 \n" - " bne .Lnrm2_kernel_F \n" + " bne 6b //nrm2_kernel_F \n" - ".Lnrm2_kernel_F_DONE: \n" + "7: //nrm2_kernel_F_DONE: \n" " ands "J", "J", #15 \n" - " beq .Lnrm2_kernel_L999 \n" + " beq 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F1: \n" + "8: //nrm2_kernel_F1: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" @@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F1 \n" + " bne 8b //nrm2_kernel_F1 \n" - ".Lnrm2_kernel_L999: \n" + "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index bc5f3c3ca..a11b18419 100644 --- a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #endif -static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) 
+static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG index = 0; @@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" - " ble .Liamax_kernel_zero \n" + " ble 10f //iamax_kernel_zero \n" " cmp "INC_X", xzr \n" - " ble .Liamax_kernel_zero \n" + " ble 10f //iamax_kernel_zero \n" " cmp "INC_X", #1 \n" - " bne .Liamax_kernel_S_BEGIN \n" + " bne 5f //iamax_kernel_S_BEGIN \n" " mov x7, "X" \n" - ".Liamax_kernel_F_BEGIN: \n" + "1: //iamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Liamax_kernel_F1 \n" + " beq 3f //iamax_kernel_F1 \n" " add "Z", "Z", #1 \n" - ".Liamax_kernel_F: \n" + "2: //iamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_F \n" + " bne 2b //iamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" - ".Liamax_kernel_F1: \n" + "3: //iamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_F10: \n" + "4: //iamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_F10 \n" - " b .Liamax_kernel_L999 \n" + " bne 4b //iamax_kernel_F10 \n" + " b 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_S_BEGIN: \n" + "5: //iamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Liamax_kernel_S1 \n" + " ble 7f //iamax_kernel_S1 \n" - ".Liamax_kernel_S4: \n" + "6: //iamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_S4 \n" + " bne 6b //iamax_kernel_S4 \n" - ".Liamax_kernel_S1: \n" + "7: //iamax_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble 
.Liamax_kernel_L999 \n" + " ble 9f //iamax_kernel_L999 \n" - ".Liamax_kernel_S10: \n" + "8: //iamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Liamax_kernel_S10 \n" + " bne 8b //iamax_kernel_S10 \n" - ".Liamax_kernel_L999: \n" + "9: //iamax_kernel_L999: \n" " mov x0, "INDEX" \n" - " b .Liamax_kernel_DONE \n" + " b 11f //iamax_kernel_DONE \n" - ".Liamax_kernel_zero: \n" + "10: //iamax_kernel_zero: \n" " mov x0, xzr \n" - ".Liamax_kernel_DONE: \n" + "11: //iamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 152f936b6..8d70b0515 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" - " ble .Lizamax_kernel_zero \n" + " ble 10f //izamax_kernel_zero \n" " cmp "INC_X", xzr \n" - " ble .Lizamax_kernel_zero \n" + " ble 10f //izamax_kernel_zero \n" " cmp "INC_X", #1 \n" - " bne .Lizamax_kernel_S_BEGIN \n" + " bne 5f //izamax_kernel_S_BEGIN \n" " mov x7, "X" \n" - ".Lizamax_kernel_F_BEGIN: \n" + "1: //izamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lizamax_kernel_F1 \n" + " beq 3f //izamax_kernel_F1 \n" " add "Z", "Z", #1 \n" - ".Lizamax_kernel_F: \n" + "2: //izamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_F \n" + " bne 2b //izamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" - ".Lizamax_kernel_F1: \n" + "3: //izamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_F10: \n" + "4: //izamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_F10 \n" - 
" b .Lizamax_kernel_L999 \n" + " bne 4b //izamax_kernel_F10 \n" + " b 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_S_BEGIN: \n" + "5: //izamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lizamax_kernel_S1 \n" + " ble 7f //izamax_kernel_S1 \n" - ".Lizamax_kernel_S4: \n" + "6: //izamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_S4 \n" + " bne 6b //izamax_kernel_S4 \n" - ".Lizamax_kernel_S1: \n" + "7: //izamax_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lizamax_kernel_L999 \n" + " ble 9f //izamax_kernel_L999 \n" - ".Lizamax_kernel_S10: \n" + "8: //izamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lizamax_kernel_S10 \n" + " bne 8b //izamax_kernel_S10 \n" - ".Lizamax_kernel_L999: \n" + "9: //izamax_kernel_L999: \n" " mov x0, "INDEX" \n" - " b .Lizamax_kernel_DONE \n" + " b 11f //izamax_kernel_DONE \n" - ".Lizamax_kernel_zero: \n" + "10: //izamax_kernel_zero: \n" " mov x0, xzr \n" - ".Lizamax_kernel_DONE: \n" + "11: //izamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 767535dae..28fc34c62 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #6 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 
3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F64: \n" + "2: //asum_kernel_F64: \n" " "KERNEL_F64" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F64 \n" + " bne 2b //asum_kernel_F64 \n" " "KERNEL_F64_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #63 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index c745dcc03..b8df4962b 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lnrm2_kernel_S_BEGIN \n" + " bne 5f //nrm2_kernel_S_BEGIN \n" - 
".Lnrm2_kernel_F_BEGIN: \n" + "1: //nrm2_kernel_F_BEGIN: \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Lnrm2_kernel_S_BEGIN \n" + " beq 5f //nrm2_kernel_S_BEGIN \n" " .align 5 \n" - ".Lnrm2_kernel_F: \n" + "2: //nrm2_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F \n" + " bne 2b //nrm2_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Lnrm2_kernel_F1: \n" + "3: //nrm2_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_F10: \n" + "4: //nrm2_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_F10 \n" - " b .Lnrm2_kernel_L999 \n" + " bne 4b //nrm2_kernel_F10 \n" + " b 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_S_BEGIN: \n" + "5: //nrm2_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lnrm2_kernel_S1 \n" + " ble 7f //nrm2_kernel_S1 \n" - ".Lnrm2_kernel_S4: \n" + "6: //nrm2_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_S4 \n" + " bne 6b //nrm2_kernel_S4 \n" - ".Lnrm2_kernel_S1: \n" + "7: //nrm2_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lnrm2_kernel_L999 \n" + " ble 9f //nrm2_kernel_L999 \n" - ".Lnrm2_kernel_S10: \n" + "8: //nrm2_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lnrm2_kernel_S10 \n" + " bne 8b //nrm2_kernel_S10 \n" - ".Lnrm2_kernel_L999: \n" + "9: //nrm2_kernel_L999: \n" " "KERNEL_FINALIZE" \n" " fmov %[RET_], "SSQD" \n" diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index e0f4ae21a..140e5a741 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp 
"INC_X", xzr \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Lasum_kernel_S_BEGIN \n" + " bne 5f //asum_kernel_S_BEGIN \n" - ".Lasum_kernel_F_BEGIN: \n" + "1: //asum_kernel_F_BEGIN: \n" " asr "J", "N", #4 \n" " cmp "J", xzr \n" - " beq .Lasum_kernel_F1 \n" + " beq 3f //asum_kernel_F1 \n" ".align 5 \n" - ".Lasum_kernel_F16: \n" + "2: //asum_kernel_F16: \n" " "KERNEL_F16" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F16 \n" + " bne 2b //asum_kernel_F16 \n" " "KERNEL_F16_FINALIZE" \n" - ".Lasum_kernel_F1: \n" + "3: //asum_kernel_F1: \n" " ands "J", "N", #15 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_F10: \n" + "4: //asum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_F10 \n" - " b .Lasum_kernel_L999 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S_BEGIN: \n" + "5: //asum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Lasum_kernel_S1 \n" + " ble 7f //asum_kernel_S1 \n" - ".Lasum_kernel_S4: \n" + "6: //asum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S4 \n" + " bne 6b //asum_kernel_S4 \n" - ".Lasum_kernel_S1: \n" + "7: //asum_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Lasum_kernel_L999 \n" + " ble 9f //asum_kernel_L999 \n" - ".Lasum_kernel_S10: \n" + "8: //asum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" - " bne .Lasum_kernel_S10 \n" + " bne 8b //asum_kernel_S10 \n" - ".Lasum_kernel_L999: \n" + "9: //asum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 64823871f..70d683077 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLON " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" " cmp "INC_X", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" - " bne .Ldot_kernel_S_BEGIN \n" + " bne 5f //dot_kernel_S_BEGIN \n" - ".Ldot_kernel_F_BEGIN: \n" + "1: //dot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" - " beq .Ldot_kernel_F1 \n" + " beq 3f //dot_kernel_F1 \n" " .align 5 \n" - ".Ldot_kernel_F: \n" + "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F \n" + " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" - ".Ldot_kernel_F1: \n" + "3: //dot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_F10: \n" + "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_F10 \n" - " b .Ldot_kernel_L999 \n" + " bne 4b //dot_kernel_F10 \n" + " b 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S_BEGIN: \n" + "5: //dot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" - " ble .Ldot_kernel_S1 \n" + " ble 7f //dot_kernel_S1 \n" - ".Ldot_kernel_S4: \n" + "6: //dot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S4 \n" + " bne 6b //dot_kernel_S4 \n" - ".Ldot_kernel_S1: \n" + "7: //dot_kernel_S1: \n" " ands "J", "N", #3 \n" - " ble .Ldot_kernel_L999 \n" + " ble 9f //dot_kernel_L999 \n" - ".Ldot_kernel_S10: \n" + "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" - " bne .Ldot_kernel_S10 \n" + " bne 8b //dot_kernel_S10 \n" - ".Ldot_kernel_L999: \n" + "9: //dot_kernel_L999: \n" " str "DOTF", [%[DOTR_]] \n" " str "DOTI", [%[DOTI_]] \n" From 
ca32b66a1c682bd227fd28380fe65768a751c127 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 4 Aug 2017 07:57:20 +0530 Subject: [PATCH 29/30] New utest for clang --- utest/CMakeLists.txt | 10 +++++--- utest/utest_main2.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 utest/utest_main2.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index d51125bc1..a7f3871c3 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -1,10 +1,14 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) -set(OpenBLAS_utest_src - utest_main.c - test_amax.c +if (MSVC AND "${CMAKE_C_COMPILER_ID}" MATCHES Clang) + set(OpenBLAS_utest_src utest_main2.c) +else () + set(OpenBLAS_utest_src + utest_main.c + test_amax.c ) +endif () if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/utest_main2.c b/utest/utest_main2.c new file mode 100644 index 000000000..565872b16 --- /dev/null +++ b/utest/utest_main2.c @@ -0,0 +1,61 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include <stdio.h> + +#define CTEST_MAIN +#define CTEST_SEGFAULT +#define CTEST_ADD_TESTS_MANUALLY + +#include "openblas_utest.h" + +CTEST(amax, samax){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, 2.2, -3.3}; + te_max=BLASFUNC(samax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +int main(int argc, const char ** argv){ + + CTEST_ADD(amax, samax); + int num_fail=0; + + num_fail=ctest_main(argc, argv); + + return num_fail; +} + From 7abbe40980d7f03a35645565443f53aa416b16fa Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 4 Aug 2017 08:04:16 +0530 Subject: [PATCH 30/30] Build all branches so that appveyor works in forks --- appveyor.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index cebd3aec6..087b22665 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -14,13 +14,6 @@ init: clone_depth: 5 -#branches to build -branches: - only: - - master - - develop - - cmake - skip_tags: true matrix: