From 73ffabe6ba46f167f5f51596ce9f4f3da02e551d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:06:14 +0100 Subject: [PATCH 01/41] Guard uses of _mm512_reduce_add_p? --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 5 +++++ kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 5 +++++ kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 5 +++++ kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index d9b380fff..5d7b3c66b 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -588,3 +589,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_nn.c +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 18c797283..e63873988 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -320,3 +321,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_tn.c +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 9bc7a7c58..215add010 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -610,3 +611,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_nn.c +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 5a9a4ea32..f394b5b3a 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -314,3 +315,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_tn.c +#endif + From 80eb581c838349ac4eef08cd688b2754f47e88ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:10:59 +0100 Subject: [PATCH 02/41] Fix non-portable u_int64_t --- kernel/x86_64/sbgemm_ncopy_16_cooperlake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index 95ed82d7c..7ed03d70d 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - u_int64_t permute_table2[] = { + uint64_t permute_table2[] = { 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, }; From c62f8e2c01bfc2f4fad800be198108bf0f7a7e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:12:20 +0100 Subject: [PATCH 03/41] Prevent compiler attempts to use k0 as mask register --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 7af51b6d8..b94aa3c84 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -114,10 +114,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) #define _MASK_STORE_C_2nx16(addr, val0, val1) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ - asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask)) #define _REORDER_C_2X(result_0, result_1) { \ __m512 tmp0, tmp1; \ @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); #define _MASK_STORE_C_16(addr, val0) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); #define N_STORE_4X(A, Bx, By) { \ _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ From abbc947edb830af96fc72ce7789f954737805830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Wed, 23 Feb 2022 22:51:59 +0000 Subject: [PATCH 04/41] Fix compilation of Skylake AVX512 kernels with GCC 6 --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 2 +- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index 5d7b3c66b..df6c65ff7 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -590,6 +590,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_nn.c +#include "../generic/gemm_small_matrix_kernel_nn.c" #endif diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index e63873988..37d1ca497 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -322,6 +322,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_tn.c +#include "../generic/gemm_small_matrix_kernel_tn.c" #endif diff --git 
a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 215add010..cea63172b 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -612,6 +612,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_nn.c +#include "../generic/gemm_small_matrix_kernel_nn.c" #endif diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index f394b5b3a..308f5e35e 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -316,6 +316,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_tn.c +#include "../generic/gemm_small_matrix_kernel_tn.c" #endif From d9894f45d30e82fd1491ae38477a1fcd79faeed1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 10:04:00 +0100 Subject: [PATCH 05/41] Define sbgemm_r to fix DYNAMIC_ARCH builds --- kernel/setparam-ref.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fe796be64..a81b32ddc 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1824,6 +1824,13 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. 
%d\n", l2, TABLE_NAME.dgemm_p); #endif +#if BUILD_BFLOAT16==1 + TABLE_NAME.sbgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sbgemm_p * TABLE_NAME.sbgemm_q * 4 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15); +#endif + #if BUILD_SINGLE==1 TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA From 9d7429406f0950113c989105eef9c5ee6cad01d3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 10:05:36 +0100 Subject: [PATCH 06/41] Declare SHUFFLE_MAGIC_NO as const to placate clang --- kernel/x86_64/sbgemm_microk_cooperlake_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index b8ed9838e..4a4e46f44 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -356,7 +356,7 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; + const int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA From 0698212c8c7318fd76cb366d27663b2c20856748 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 15:33:02 +0100 Subject: [PATCH 07/41] Remove stray $ --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8aa6728d5..98c803e71 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -678,7 +678,7 @@ endif () set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) - set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) endif () GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" 
"gemm_small_matrix_permit" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") From 9c626e466ed52edeff947607b01a580f549dc204 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 15:36:02 +0100 Subject: [PATCH 08/41] really fix definition of SHUFFLE_MAGIC_NO --- kernel/x86_64/sbgemm_microk_cooperlake_template.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index 4a4e46f44..bd5cbb744 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -13,6 +13,8 @@ #define ONE 1.e0f #define ZERO 0.e0f +#define SHUFFLE_MAGIC_NO (const int) 0x39 + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT #undef SBGEMM_BLOCK_KERNEL_NN_32x8xK @@ -356,7 +358,6 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - const int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -465,7 +466,6 @@ void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1192,7 +1192,6 @@ void sbgemm_block_kernel_tn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1291,7 +1290,6 @@ void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA From 35d5105922445adeec359d42cf5972df88e213af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:23:40 +0100 Subject: [PATCH 
09/41] Enable xGEMMT functions --- relapack/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9d6919463 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 0 +#define INCLUDE_XGEMMT 1 #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT From 4058f324923cd293b2117eafb65bc758a9a34a19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:24:27 +0100 Subject: [PATCH 10/41] Fix xGEMMT argument lists --- relapack/src/lapack_wrappers.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..fc3dbc11e 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -566,7 +566,8 @@ void LAPACK(sgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -578,7 +579,8 @@ void LAPACK(dgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -590,7 +592,8 @@ void LAPACK(cgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -602,6 +605,7 @@ void LAPACK(zgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); + blasint 
info; + RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif From 1c1ffb0591186e50311670369dee2cb450980d9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:27:34 +0100 Subject: [PATCH 11/41] Annotate LAPACKE_lsame with the const attribute for GCC and compatible compilers --- lapack-netlib/LAPACKE/include/lapacke_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index a9236d23f..ec29f24fc 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -67,7 +67,11 @@ extern "C" { void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ -lapack_logical LAPACKE_lsame( char ca, char cb ); +lapack_logical LAPACKE_lsame( char ca, char cb ) +#if defined __GNUC__ + __attribute__((const)) +#endif + ; /* Functions to convert column-major to row-major 2d arrays and vice versa. */ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, From 225683218c85a3c2246f7c66903ab3c03a3f6bfe Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 28 Feb 2022 03:22:31 +0000 Subject: [PATCH 12/41] Small Matrix: use proper inline asm input constraint for AVX512 mask --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_nt_skylakex.c | 4 ++-- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 4 ++-- kernel/x86_64/sgemm_small_kernel_nt_skylakex.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index df6c65ff7..a98772b94 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ - asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) #endif @@ -266,7 +266,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp int mm = M - i; if (!mm) return 0; if (mm > 4 || K < 16) { - register __mmask8 mask asm("k1") = (1UL << mm) - 1; + register __mmask8 mask = (1UL << mm) - 1; for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index e757197ba..9e6eb1c4d 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ - asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) #define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ @@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } int mm = M - i; if (mm >= 6) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index cea63172b..2366fe3aa 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #endif @@ -267,7 +267,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp int mm = M - i; if (!mm) return 0; if (mm > 8 || K < 32) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index a7d87f8c4..bb00228de 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ @@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } int mm = M - i; if (mm >= 12) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From ed8c028f7fc30cc2327410b8051c99f8eb1f58cb Mon Sep 17 00:00:00 2001 From: Alessio Zanga Date: Sat, 5 Mar 2022 00:07:01 +0100 Subject: [PATCH 13/41] Remove MSVC limitation --- CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ab9f3af80..981130e22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,15 +17,7 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -if(MSVC AND NOT DEFINED NOFORTRAN) - set(NOFORTRAN ON) -endif() - ####### -if(MSVC) - option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) -endif() - option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) From ed2871cb71ee0ce2e86f6ed62bf2ebd10a1f3180 Mon Sep 17 00:00:00 2001 From: AlessioZanga Date: Sat, 5 Mar 2022 23:35:29 +0100 Subject: [PATCH 14/41] Change `BUILD_WITHOUT_LAPACK` to `OFF` by 
default --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 981130e22..2b99a7722 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) ####### +option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) + option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) From 2d0ad89b0d88604133260eb6df348a3b6cfdedf7 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Fri, 4 Mar 2022 17:14:52 +0800 Subject: [PATCH 15/41] Support Zhaoxin/Centaur kh40000 as ZEN Signed-off-by: JonasZhou --- cpuid_x86.c | 54 ++++++++++++++++++++++++++++++++++++++--- driver/others/dynamic.c | 27 +++++++++++++++++++-- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index d7d85eb20..4ac1de047 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1707,8 +1707,18 @@ int get_cpuname(void){ if (model == 0xf && stepping < 0xe) return CPUTYPE_NANO; return CPUTYPE_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CPUTYPE_ZEN; + else + return CPUTYPE_DUNNINGTON; + default: + return CPUTYPE_NEHALEM; + } default: - if (family >= 0x7) + if (family >= 0x8) return CPUTYPE_NEHALEM; else return CPUTYPE_VIAC3; @@ -1716,7 +1726,20 @@ int get_cpuname(void){ } if (vendor == VENDOR_ZHAOXIN){ - return CPUTYPE_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CPUTYPE_ZEN; + else + return CPUTYPE_DUNNINGTON; + default: + return CPUTYPE_NEHALEM; + } + default: + return CPUTYPE_NEHALEM; + } } if (vendor == VENDOR_RISE){ @@ -2416,8 +2439,18 @@ int get_coretype(void){ if (model == 0xf && stepping < 0xe) return CORE_NANO; return CORE_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + 
if (support_avx2()) + return CORE_ZEN; + else + return CORE_DUNNINGTON; + default: + return CORE_NEHALEM; + } default: - if (family >= 0x7) + if (family >= 0x8) return CORE_NEHALEM; else return CORE_VIAC3; @@ -2425,7 +2458,20 @@ int get_coretype(void){ } if (vendor == VENDOR_ZHAOXIN) { - return CORE_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CORE_ZEN; + else + return CORE_DUNNINGTON; + default: + return CORE_NEHALEM; + } + default: + return CORE_NEHALEM; + } } return CORE_UNKNOWN; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 52a7c6087..7d7a41de2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -875,14 +875,37 @@ static gotoblas_t *get_coretype(void){ if (model == 0xf && stepping < 0xe) return &gotoblas_NANO; return &gotoblas_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return &gotoblas_ZEN; + else + return &gotoblas_DUNNINGTON; + default: + return &gotoblas_NEHALEM; + } default: - if (family >= 0x7) + if (family >= 0x8) return &gotoblas_NEHALEM; } } if (vendor == VENDOR_ZHAOXIN) { - return &gotoblas_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return &gotoblas_ZEN; + else + return &gotoblas_DUNNINGTON; + default: + return &gotoblas_NEHALEM; + } + default: + return &gotoblas_NEHALEM; + } } return NULL; From bf4642eb7e428da5b9e8bb5fa0f1e8b8d92a3fd1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Mar 2022 16:19:29 +0100 Subject: [PATCH 16/41] Report USE_TLS if set --- driver/others/openblas_get_config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7fefee33d..7a5cbeb62 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -60,6 +60,9 @@ static char* openblas_config_str="" #ifdef USE_OPENMP "USE_OPENMP " #endif +#ifdef USE_TLS + "USE_TLS " +#endif 
#ifndef DYNAMIC_ARCH CHAR_CORENAME #endif From 5cc1111383db14a59ccda5ce5140d0f631f70ad9 Mon Sep 17 00:00:00 2001 From: Caroline Newcombe Date: Fri, 11 Mar 2022 11:56:33 -0600 Subject: [PATCH 17/41] fix unsafe read of Y in assembly kernel --- kernel/x86_64/zsymv_L_sse2.S | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index bfe0cf7ee..fa61ac939 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -452,11 +452,6 @@ MOVDDUP(4 * SIZE, A1, a1) - movsd 0 * SIZE(YY), yy1 - movhpd 1 * SIZE(YY), yy1 - movsd 2 * SIZE(YY), yy2 - movhpd 3 * SIZE(YY), yy2 - movapd 8 * SIZE(XX), xtemp1 movapd 10 * SIZE(XX), xtemp2 movapd 12 * SIZE(XX), xtemp3 @@ -475,6 +470,12 @@ MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) ALIGN_3 +.L12_prep: + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + .L12: movapd xtemp1, xt1 mulpd a1, xt1 @@ -608,8 +609,6 @@ movlpd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) - movsd 10 * SIZE(YY), yy2 - movhpd 11 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 18 * SIZE(XX), xtemp2 @@ -621,8 +620,6 @@ movlpd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) - movsd 8 * SIZE(YY), yy1 - movhpd 9 * SIZE(YY), yy1 subq $-16 * SIZE, XX addq $ 8 * SIZE, YY @@ -630,7 +627,8 @@ addq $ 8 * SIZE, A2 decq I - jg .L12 + jg .L12_prep + jmp .L15 ALIGN_3 .L14: @@ -641,7 +639,6 @@ jle .L16 MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) - jmp .L15_pastcheck .L15: movq M, I @@ -650,6 +647,11 @@ testq $2, I jle .L16 + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + .L15_pastcheck: movapd xtemp1, xt1 mulpd a1, xt1 @@ -705,8 +707,6 @@ movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) - movsd 6 * SIZE(YY), yy2 - movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 @@ -717,8 +717,6 @@ movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) - movsd 4 * 
SIZE(YY), yy1 - movhpd 5 * SIZE(YY), yy1 addq $4 * SIZE, YY addq $4 * SIZE, A1 @@ -731,6 +729,9 @@ MOVDDUP(1 * SIZE, A1, a2) + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 From 69f2ac4ea20d48f08e9ddefd5adcd755acf5a768 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Thu, 17 Mar 2022 20:02:39 -0400 Subject: [PATCH 18/41] Fix broken elif in dynamic.c This fixes compilation in the following case: $(MAKE) USE_OPENMP=1 USE_THREAD=1 NO_LAPACK=0 DYNAMIC_ARCH=1 \ DYNAMIC_LIST="HASWELL SKYLAKEX ATOM COOPERLAKE SAPPHIRERAPIDS ZEN" --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7d7a41de2..df7fa67e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA; #endif #ifdef DYN_ATOM extern gotoblas_t gotoblas_ATOM; -elif defined(DYN_NEHALEM) +#elif defined(DYN_NEHALEM) #define gotoblas_ATOM gotoblas_NEHALEM #else #define gotoblas_ATOM gotoblas_PRESCOTT From 2408315d105189963ea82e74340751eccfd22826 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Mar 2022 21:37:55 +0100 Subject: [PATCH 19/41] Skip tests if Windows powershell added a BOM --- test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e4ee8b28b..5214d9cab 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,6 +30,10 @@ if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 "if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" "$ErrorActionPreference = \"Stop\"\n" +"If ((Get-Content $args[1] | & file - | %{$_ -match \"BOM\"}) -contains $true) {\n" +"echo 'Skipped due to wrong input encoding'\n" +"exit 0\n" +"}\n" "Get-Content $args[1] | & $args[0]\n" "If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" "echo Error\n" From a0e86adf934561ddd5e4e7aa9e41f247aff2f395 Mon Sep 
17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Mar 2022 21:51:09 +0100 Subject: [PATCH 20/41] Update Windows jobs in Azure CI to use Windows2022 --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 04ed428de..10d639a4a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -65,7 +65,7 @@ jobs: - task: CMake@1 inputs: workingDirectory: 'build' # Optional - cmakeArgs: '-G "Visual Studio 16 2019" ..' + cmakeArgs: '-G "Visual Studio 17 2022" ..' - task: CMake@1 inputs: cmakeArgs: '--build . --config Release' @@ -103,7 +103,7 @@ jobs: - job: Windows_flang_clang pool: - vmImage: 'windows-latest' + vmImage: 'windows-2022' steps: - script: | set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" @@ -114,7 +114,7 @@ jobs: conda install --yes --quiet ninja flang mkdir build cd build - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . --config Release ctest @@ -178,7 +178,7 @@ jobs: cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . 
ctest - + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' From 9fbeb88fb87dcc418c9ef01c5f24c85029dfbbef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Mar 2022 15:19:55 +0100 Subject: [PATCH 21/41] Utilize compiler AVX512 capability info from c_check when building getarch --- Makefile.prebuild | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 399db956f..4dad74d63 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -71,7 +71,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + avx512=$$(perl c_check - - gcc | grep NO_AVX512); \ + $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy ifndef TARGET_CORE From 93a81856ae6a34c4329054744237d46ed347ccec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Mar 2022 15:22:13 +0100 Subject: [PATCH 22/41] Revert AVX512 capability check from PR #1980 (moved to build) --- getarch.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/getarch.c b/getarch.c index 00e544bc7..e49eac1a3 100644 --- a/getarch.c +++ b/getarch.c @@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #endif -#if defined(__x86_64__) || defined(_M_X64) -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) -#else -#ifndef NO_AVX512 -#define NO_AVX512 -#endif -#endif -#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ From c87a4dbf35c809ebe6bc88c7d8dce8f2e7b135ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Mar 2022 15:48:58 +0100 Subject: [PATCH 23/41] Fix checks for AVX512 and atomics --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index 999f5a7a7..e10ddfebc 100644 --- a/c_check +++ b/c_check @@ -254,7 +254,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { # $tmpf = new File::Temp( UNLINK => 1 ); ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; + print $fh "#include \n\nint main(void){ __asm__ volatile($code); }\n"; $args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf"; if ($compiler eq "PGI") { $args = " -tp skylake -c -o $tmpf.o $tmpf"; @@ -278,7 +278,7 @@ if ($data =~ /HAVE_C11/) { $c11_atomics = 0; } else { ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); - print $tmpf "#include \nint main(void){}\n"; + print $fh "#include \nint main(void){}\n"; $args = " -c -o $tmpf.o $tmpf"; my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; From 40302558ed3d3c3f100e96dd042a5996c3d16bbd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Mar 2022 20:05:32 +0100 Subject: [PATCH 24/41] Remove extraneous (and wrong) definition of sbgemm_r on x86_64 --- kernel/setparam-ref.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index a81b32ddc..9f5d34d9b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1239,7 +1239,6 @@ static void init_parameter(void) { #ifdef 
BUILD_BFLOAT16 TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; - TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) From 3efbf968f15d93ceec5a64bbfbf5e7a46a1afd47 Mon Sep 17 00:00:00 2001 From: Aisha Tammy Date: Sun, 1 Nov 2020 02:43:56 +0000 Subject: [PATCH 25/41] create INDEX64 target --- lapack-netlib/BLAS/CMakeLists.txt | 4 +- lapack-netlib/BLAS/SRC/CMakeLists.txt | 6 +- lapack-netlib/BLAS/TESTING/CMakeLists.txt | 2 +- lapack-netlib/BLAS/blas.pc.in | 2 +- lapack-netlib/CBLAS/CMakeLists.txt | 30 +++++----- lapack-netlib/CBLAS/cblas.pc.in | 4 +- .../CBLAS/cmake/cblas-config-build.cmake.in | 4 +- .../CBLAS/cmake/cblas-config-install.cmake.in | 8 +-- lapack-netlib/CBLAS/examples/CMakeLists.txt | 4 +- lapack-netlib/CBLAS/examples/cblas_example1.c | 2 +- lapack-netlib/CBLAS/examples/cblas_example2.c | 2 +- lapack-netlib/CBLAS/src/CMakeLists.txt | 10 ++-- lapack-netlib/CBLAS/testing/CMakeLists.txt | 24 ++++---- .../CMAKE/lapack-config-build.cmake.in | 2 +- .../CMAKE/lapack-config-install.cmake.in | 2 +- lapack-netlib/CMakeLists.txt | 60 ++++++++++++------- lapack-netlib/LAPACKE/CMakeLists.txt | 36 +++++------ .../cmake/lapacke-config-build.cmake.in | 6 +- .../cmake/lapacke-config-install.cmake.in | 8 +-- lapack-netlib/LAPACKE/example/CMakeLists.txt | 8 +-- lapack-netlib/LAPACKE/lapacke.pc.in | 4 +- lapack-netlib/SRC/CMakeLists.txt | 14 ++--- lapack-netlib/TESTING/MATGEN/CMakeLists.txt | 6 +- 23 files changed, 133 insertions(+), 115 deletions(-) diff --git a/lapack-netlib/BLAS/CMakeLists.txt b/lapack-netlib/BLAS/CMakeLists.txt index ee5676fc6..45cec39c2 100644 --- a/lapack-netlib/BLAS/CMakeLists.txt +++ b/lapack-netlib/BLAS/CMakeLists.txt @@ -2,9 +2,9 @@ add_subdirectory(SRC) if(BUILD_TESTING) add_subdirectory(TESTING) endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/blas.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in 
${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/blas.pc + ${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) diff --git a/lapack-netlib/BLAS/SRC/CMakeLists.txt b/lapack-netlib/BLAS/SRC/CMakeLists.txt index 41c480432..0078dca40 100644 --- a/lapack-netlib/BLAS/SRC/CMakeLists.txt +++ b/lapack-netlib/BLAS/SRC/CMakeLists.txt @@ -97,10 +97,10 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(blas ${SOURCES}) +add_library(${BLASLIB} ${SOURCES}) set_target_properties( - blas PROPERTIES + ${BLASLIB} PROPERTIES VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -lapack_install_library(blas) +lapack_install_library(${BLASLIB}) diff --git a/lapack-netlib/BLAS/TESTING/CMakeLists.txt b/lapack-netlib/BLAS/TESTING/CMakeLists.txt index 9b130db0f..ae82cf937 100644 --- a/lapack-netlib/BLAS/TESTING/CMakeLists.txt +++ b/lapack-netlib/BLAS/TESTING/CMakeLists.txt @@ -2,7 +2,7 @@ macro(add_blas_test name src) get_filename_component(baseNAME ${src} NAME_WE) set(TEST_INPUT "${CMAKE_CURRENT_SOURCE_DIR}/${baseNAME}.in") add_executable(${name} ${src}) - target_link_libraries(${name} blas) + target_link_libraries(${name} ${BLASLIB}) if(EXISTS "${TEST_INPUT}") add_test(NAME BLAS-${name} COMMAND "${CMAKE_COMMAND}" -DTEST=$ diff --git a/lapack-netlib/BLAS/blas.pc.in b/lapack-netlib/BLAS/blas.pc.in index 37809773b..31e11e638 100644 --- a/lapack-netlib/BLAS/blas.pc.in +++ b/lapack-netlib/BLAS/blas.pc.in @@ -5,4 +5,4 @@ Name: BLAS Description: FORTRAN reference implementation of BLAS Basic Linear Algebra Subprograms Version: @LAPACK_VERSION@ URL: http://www.netlib.org/blas/ -Libs: -L${libdir} -lblas +Libs: -L${libdir} -l@BLASLIB@ diff --git a/lapack-netlib/CBLAS/CMakeLists.txt b/lapack-netlib/CBLAS/CMakeLists.txt index 04c5ab795..da46027ac 100644 --- a/lapack-netlib/CBLAS/CMakeLists.txt +++ b/lapack-netlib/CBLAS/CMakeLists.txt @@ -1,7 +1,7 @@ message(STATUS "CBLAS enable") 
enable_language(C) -set(LAPACK_INSTALL_EXPORT_NAME cblas-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${CBLASLIB}-targets) # Create a header file cblas.h for the routines called in my C programs include(FortranCInterface) @@ -42,15 +42,15 @@ if(BUILD_TESTING) endif() if(NOT BLAS_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} blas) + set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB}) endif() # Export cblas targets from the # install tree, if any. set(_cblas_config_install_guard_target "") if(ALL_TARGETS) - install(EXPORT cblas-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} + install(EXPORT ${CBLASLIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} COMPONENT Development ) # Choose one of the cblas targets to use as a guard for @@ -61,7 +61,7 @@ endif() # Export cblas targets from the build tree, if any. set(_cblas_config_build_guard_target "") if(ALL_TARGETS) - export(TARGETS ${ALL_TARGETS} FILE cblas-targets.cmake) + export(TARGETS ${ALL_TARGETS} FILE ${CBLASLIB}-targets.cmake) # Choose one of the cblas targets to use as a guard # for cblas-config.cmake to load targets from the build tree. 
@@ -69,26 +69,26 @@ if(ALL_TARGETS) endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-version.cmake.in - ${LAPACK_BINARY_DIR}/cblas-config-version.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-build.cmake.in - ${LAPACK_BINARY_DIR}/cblas-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config.cmake @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc + ${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc DESTINATION ${PKG_CONFIG_DIR} ) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-install.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake @ONLY) + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake - ${LAPACK_BINARY_DIR}/cblas-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} ) -#install(EXPORT cblas-targets -# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} +#install(EXPORT ${CBLASLIB}-targets +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} # COMPONENT Development # ) diff --git a/lapack-netlib/CBLAS/cblas.pc.in b/lapack-netlib/CBLAS/cblas.pc.in index 7c95ebbb4..882642e6c 100644 --- a/lapack-netlib/CBLAS/cblas.pc.in +++ b/lapack-netlib/CBLAS/cblas.pc.in @@ -5,6 +5,6 @@ Name: CBLAS Description: C Standard Interface to BLAS Basic Linear Algebra Subprograms Version: @LAPACK_VERSION@ URL: http://www.netlib.org/blas/#_cblas -Libs: -L${libdir} -lcblas 
+Libs: -L${libdir} -l@CBLASLIB@ Cflags: -I${includedir} -Requires.private: blas +Requires.private: @BLASLIB@ diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in index 3747f041c..dc21c2d0f 100644 --- a/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in +++ b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in @@ -4,11 +4,11 @@ find_package(LAPACK NO_MODULE) # Load lapack targets from the build tree, including lapacke targets. if(NOT TARGET lapacke) - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() # Report cblas header search locations from build tree. set(CBLAS_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include") # Report cblas libraries. -set(CBLAS_LIBRARIES cblas) +set(CBLAS_LIBRARIES @CBLASLIB@) diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in index 215e28a57..44046a283 100644 --- a/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in +++ b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in @@ -5,19 +5,19 @@ get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) # Load the LAPACK package with which we were built. -set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@") +set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACKLIB@-@LAPACK_VERSION@") find_package(LAPACK NO_MODULE) # Load lapacke targets from the install tree. -if(NOT TARGET cblas) - include(${_CBLAS_SELF_DIR}/cblas-targets.cmake) +if(NOT TARGET @CBLASLIB@) + include(${_CBLAS_SELF_DIR}/@CBLASLIB@-targets.cmake) endif() # Report lapacke header search locations. set(CBLAS_INCLUDE_DIRS ${_CBLAS_PREFIX}/include) # Report lapacke libraries. 
-set(CBLAS_LIBRARIES cblas) +set(CBLAS_LIBRARIES @CBLASLIB@) unset(_CBLAS_PREFIX) unset(_CBLAS_SELF_DIR) diff --git a/lapack-netlib/CBLAS/examples/CMakeLists.txt b/lapack-netlib/CBLAS/examples/CMakeLists.txt index 0241fd164..74f7d8bb8 100644 --- a/lapack-netlib/CBLAS/examples/CMakeLists.txt +++ b/lapack-netlib/CBLAS/examples/CMakeLists.txt @@ -1,8 +1,8 @@ add_executable(xexample1_CBLAS cblas_example1.c) add_executable(xexample2_CBLAS cblas_example2.c) -target_link_libraries(xexample1_CBLAS cblas) -target_link_libraries(xexample2_CBLAS cblas ${BLAS_LIBRARIES}) +target_link_libraries(xexample1_CBLAS ${CBLASLIB}) +target_link_libraries(xexample2_CBLAS ${CBLASLIB} ${BLAS_LIBRARIES}) add_test(example1_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample1_CBLAS) add_test(example2_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample2_CBLAS) diff --git a/lapack-netlib/CBLAS/examples/cblas_example1.c b/lapack-netlib/CBLAS/examples/cblas_example1.c index 3d5ed330c..d89aeadb0 100644 --- a/lapack-netlib/CBLAS/examples/cblas_example1.c +++ b/lapack-netlib/CBLAS/examples/cblas_example1.c @@ -11,7 +11,7 @@ int main ( ) double *a, *x, *y; double alpha, beta; - int m, n, lda, incx, incy, i; + CBLAS_INDEX m, n, lda, incx, incy, i; Layout = CblasColMajor; transa = CblasNoTrans; diff --git a/lapack-netlib/CBLAS/examples/cblas_example2.c b/lapack-netlib/CBLAS/examples/cblas_example2.c index d2c28d53f..e82ae518c 100644 --- a/lapack-netlib/CBLAS/examples/cblas_example2.c +++ b/lapack-netlib/CBLAS/examples/cblas_example2.c @@ -9,7 +9,7 @@ int main (int argc, char **argv ) { - int rout=-1,info=0,m,n,k,lda,ldb,ldc; + CBLAS_INDEX rout=-1,info=0,m,n,k,lda,ldb,ldc; double A[2] = {0.0,0.0}, B[2] = {0.0,0.0}, C[2] = {0.0,0.0}, diff --git a/lapack-netlib/CBLAS/src/CMakeLists.txt b/lapack-netlib/CBLAS/src/CMakeLists.txt index 90e19f818..1313e798b 100644 --- a/lapack-netlib/CBLAS/src/CMakeLists.txt +++ b/lapack-netlib/CBLAS/src/CMakeLists.txt @@ -113,16 +113,16 @@ if(BUILD_COMPLEX16) endif() 
list(REMOVE_DUPLICATES SOURCES) -add_library(cblas ${SOURCES}) +add_library(${CBLASLIB} ${SOURCES}) set_target_properties( - cblas PROPERTIES + ${CBLASLIB} PROPERTIES LINKER_LANGUAGE C VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -target_include_directories(cblas PUBLIC +target_include_directories(${CBLASLIB} PUBLIC $ $ ) -target_link_libraries(cblas PRIVATE ${BLAS_LIBRARIES}) -lapack_install_library(cblas) +target_link_libraries(${CBLASLIB} PRIVATE ${BLAS_LIBRARIES}) +lapack_install_library(${CBLASLIB}) diff --git a/lapack-netlib/CBLAS/testing/CMakeLists.txt b/lapack-netlib/CBLAS/testing/CMakeLists.txt index 2459695b8..34e92a423 100644 --- a/lapack-netlib/CBLAS/testing/CMakeLists.txt +++ b/lapack-netlib/CBLAS/testing/CMakeLists.txt @@ -52,9 +52,9 @@ if(BUILD_SINGLE) add_executable(xscblat2 c_sblat2.f ${STESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xscblat3 c_sblat3.f ${STESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xscblat1 cblas) - target_link_libraries(xscblat2 cblas) - target_link_libraries(xscblat3 cblas) + target_link_libraries(xscblat1 ${CBLASLIB}) + target_link_libraries(xscblat2 ${CBLASLIB}) + target_link_libraries(xscblat3 ${CBLASLIB}) add_cblas_test(stest1.out "" xscblat1) add_cblas_test(stest2.out sin2 xscblat2) @@ -66,9 +66,9 @@ if(BUILD_DOUBLE) add_executable(xdcblat2 c_dblat2.f ${DTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xdcblat3 c_dblat3.f ${DTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xdcblat1 cblas) - target_link_libraries(xdcblat2 cblas) - target_link_libraries(xdcblat3 cblas) + target_link_libraries(xdcblat1 ${CBLASLIB}) + target_link_libraries(xdcblat2 ${CBLASLIB}) + target_link_libraries(xdcblat3 ${CBLASLIB}) add_cblas_test(dtest1.out "" xdcblat1) add_cblas_test(dtest2.out din2 xdcblat2) @@ -80,9 +80,9 @@ if(BUILD_COMPLEX) add_executable(xccblat2 c_cblat2.f ${CTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) 
add_executable(xccblat3 c_cblat3.f ${CTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xccblat1 cblas ${BLAS_LIBRARIES}) - target_link_libraries(xccblat2 cblas) - target_link_libraries(xccblat3 cblas) + target_link_libraries(xccblat1 ${CBLASLIB} ${BLAS_LIBRARIES}) + target_link_libraries(xccblat2 ${CBLASLIB}) + target_link_libraries(xccblat3 ${CBLASLIB}) add_cblas_test(ctest1.out "" xccblat1) add_cblas_test(ctest2.out cin2 xccblat2) @@ -94,9 +94,9 @@ if(BUILD_COMPLEX16) add_executable(xzcblat2 c_zblat2.f ${ZTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xzcblat3 c_zblat3.f ${ZTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xzcblat1 cblas) - target_link_libraries(xzcblat2 cblas) - target_link_libraries(xzcblat3 cblas) + target_link_libraries(xzcblat1 ${CBLASLIB}) + target_link_libraries(xzcblat2 ${CBLASLIB}) + target_link_libraries(xzcblat3 ${CBLASLIB}) add_cblas_test(ztest1.out "" xzcblat1) add_cblas_test(ztest2.out zin2 xzcblat2) diff --git a/lapack-netlib/CMAKE/lapack-config-build.cmake.in b/lapack-netlib/CMAKE/lapack-config-build.cmake.in index f7e041663..da44a6ae4 100644 --- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in @@ -1,7 +1,7 @@ # Load lapack targets from the build tree if necessary. 
set(_LAPACK_TARGET "@_lapack_config_build_guard_target@") if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() unset(_LAPACK_TARGET) diff --git a/lapack-netlib/CMAKE/lapack-config-install.cmake.in b/lapack-netlib/CMAKE/lapack-config-install.cmake.in index 3de7362ea..77609609c 100644 --- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in @@ -4,7 +4,7 @@ get_filename_component(_LAPACK_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) # Load lapack targets from the install tree if necessary. set(_LAPACK_TARGET "@_lapack_config_install_guard_target@") if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") - include("${_LAPACK_SELF_DIR}/lapack-targets.cmake") + include("${_LAPACK_SELF_DIR}/@LAPACKLIB@-targets.cmake") endif() unset(_LAPACK_TARGET) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index df43d91b1..a30efbbfe 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -44,6 +44,24 @@ endif() # By default static library option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +# By default build index32 library +option(BUILD_INDEX64 "Build Index-64 API libraries" OFF) +if(BUILD_INDEX64) + set(BLASLIB "blas64") + set(CBLASLIB "cblas64") + set(LAPACKLIB "lapack64") + set(LAPACKELIB "lapacke64") + set(TMGLIB "tmglib64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWeirdNEC -DLAPACK_ILP64 -DHAVE_LAPACK_CONFIG_H") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fdefault-integer-8") +else() + set(BLASLIB "blas") + set(CBLASLIB "cblas") + set(LAPACKLIB "lapack") + set(LAPACKELIB "lapacke") + set(TMGLIB "tmglib") +endif() + include(GNUInstallDirs) # Updated OSX RPATH settings @@ -73,10 +91,10 @@ include(PreventInBuildInstalls) if(UNIX) if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) - list(APPEND CMAKE_Fortran_FLAGS "-fp-model strict") + 
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") endif() if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - list(APPEND CMAKE_Fortran_FLAGS "-qnosave -qstrict=none") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") endif() # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin @@ -112,7 +130,7 @@ endif() # -------------------------------------------------- -set(LAPACK_INSTALL_EXPORT_NAME lapack-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets) macro(lapack_install_library lib) install(TARGETS ${lib} @@ -220,7 +238,7 @@ endif() if(NOT BLAS_FOUND) message(STATUS "Using supplied NETLIB BLAS implementation") add_subdirectory(BLAS) - set(BLAS_LIBRARIES blas) + set(BLAS_LIBRARIES ${BLASLIB}) else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${BLAS_LINKER_FLAGS}" @@ -279,7 +297,7 @@ endif() # Neither user specified or optimized LAPACK libraries can be used if(NOT LATESTLAPACK_FOUND) message(STATUS "Using supplied NETLIB LAPACK implementation") - set(LAPACK_LIBRARIES lapack) + set(LAPACK_LIBRARIES ${LAPACKLIB}) add_subdirectory(SRC) else() set(CMAKE_EXE_LINKER_FLAGS @@ -371,23 +389,23 @@ include(CPack) # -------------------------------------------------- if(NOT BLAS_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} blas) + set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB}) endif() if(NOT LATESTLAPACK_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} lapack) + set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB}) endif() if(BUILD_TESTING OR LAPACKE_WITH_TMG) - set(ALL_TARGETS ${ALL_TARGETS} tmglib) + set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) endif() # Export lapack targets, not including lapacke, from the # install tree, if any. 
set(_lapack_config_install_guard_target "") if(ALL_TARGETS) - install(EXPORT lapack-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + install(EXPORT ${LAPACKLIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) @@ -398,18 +416,18 @@ endif() # Include cblas in targets exported from the build tree. if(CBLAS) - set(ALL_TARGETS ${ALL_TARGETS} cblas) + set(ALL_TARGETS ${ALL_TARGETS} ${CBLASLIB}) endif() # Include lapacke in targets exported from the build tree. if(LAPACKE) - set(ALL_TARGETS ${ALL_TARGETS} lapacke) + set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB}) endif() # Export lapack and lapacke targets from the build tree, if any. set(_lapack_config_build_guard_target "") if(ALL_TARGETS) - export(TARGETS ${ALL_TARGETS} FILE lapack-targets.cmake) + export(TARGETS ${ALL_TARGETS} FILE ${LAPACKLIB}-targets.cmake) # Choose one of the lapack or lapacke targets to use as a guard # for lapack-config.cmake to load targets from the build tree. 
@@ -417,30 +435,30 @@ if(ALL_TARGETS) endif() configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-build.cmake.in - ${LAPACK_BINARY_DIR}/lapack-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config.cmake @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc + ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in - ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake @ONLY) include(CMakePackageConfigHelpers) write_basic_package_version_file( - ${LAPACK_BINARY_DIR}/lapack-config-version.cmake + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake VERSION ${LAPACK_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES - ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake - ${LAPACK_BINARY_DIR}/lapack-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + ${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) diff --git a/lapack-netlib/LAPACKE/CMakeLists.txt b/lapack-netlib/LAPACKE/CMakeLists.txt index 0589a74ba..60d5ddbe3 100644 --- a/lapack-netlib/LAPACKE/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/CMakeLists.txt @@ -1,7 +1,7 @@ message(STATUS "LAPACKE enable") enable_language(C) -set(LAPACK_INSTALL_EXPORT_NAME lapacke-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets) # Create a header file lapacke_mangling.h for the routines called in my C programs include(FortranCInterface) @@ -72,28 +72,28 @@ if(LAPACKE_WITH_TMG) endif() list(APPEND SOURCES ${UTILS}) 
-add_library(lapacke ${SOURCES}) +add_library(${LAPACKELIB} ${SOURCES}) set_target_properties( - lapacke PROPERTIES + ${LAPACKELIB} PROPERTIES LINKER_LANGUAGE C VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -target_include_directories(lapacke PUBLIC +target_include_directories(${LAPACKELIB} PUBLIC $ $ ) if(WIN32 AND NOT UNIX) - target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE) + target_compile_definitions(${LAPACKELIB} PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE) message(STATUS "Windows BUILD") endif() if(LAPACKE_WITH_TMG) - target_link_libraries(lapacke PRIVATE tmglib) + target_link_libraries(${LAPACKELIB} PRIVATE ${TMGLIB}) endif() -target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES}) +target_link_libraries(${LAPACKELIB} PRIVATE ${LAPACK_LIBRARIES}) -lapack_install_library(lapacke) +lapack_install_library(${LAPACKELIB}) install( FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} @@ -105,28 +105,28 @@ if(BUILD_TESTING) endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc + ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in - ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-build.cmake.in - ${LAPACK_BINARY_DIR}/lapacke-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-install.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake @ONLY) + 
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake - ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION} COMPONENT Development ) -install(EXPORT lapacke-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} +install(EXPORT ${LAPACKELIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION} COMPONENT Development ) diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in index 0a1350172..49ce4770a 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in @@ -3,8 +3,8 @@ set(LAPACK_DIR "@LAPACK_BINARY_DIR@") find_package(LAPACK NO_MODULE) # Load lapack targets from the build tree, including lapacke targets. -if(NOT TARGET lapacke) - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") +if(NOT TARGET @LAPACKELIB@) + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() # Hint for project building against lapack @@ -14,4 +14,4 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include") # Report lapacke libraries. 
-set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES}) +set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES}) diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in index 57a5c2b2f..2e5c36fa1 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in @@ -5,12 +5,12 @@ get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH) get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH) # Load the LAPACK package with which we were built. -set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@") +set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACKLIB@-@LAPACK_VERSION@") find_package(LAPACK NO_MODULE) # Load lapacke targets from the install tree. -if(NOT TARGET lapacke) - include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake) +if(NOT TARGET @LAPACKELIB@) + include(${_LAPACKE_SELF_DIR}/@LAPACKELIB@-targets.cmake) endif() # Hint for project building against lapack @@ -20,7 +20,7 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include) # Report lapacke libraries. 
-set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES}) +set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES}) unset(_LAPACKE_PREFIX) unset(_LAPACKE_SELF_DIR) diff --git a/lapack-netlib/LAPACKE/example/CMakeLists.txt b/lapack-netlib/LAPACKE/example/CMakeLists.txt index fa75c731c..27db8ee21 100644 --- a/lapack-netlib/LAPACKE/example/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/example/CMakeLists.txt @@ -3,10 +3,10 @@ add_executable(xexample_DGESV_colmajor example_DGESV_colmajor.c lapacke_example_ add_executable(xexample_DGELS_rowmajor example_DGELS_rowmajor.c lapacke_example_aux.c lapacke_example_aux.h) add_executable(xexample_DGELS_colmajor example_DGELS_colmajor.c lapacke_example_aux.c lapacke_example_aux.h) -target_link_libraries(xexample_DGESV_rowmajor lapacke) -target_link_libraries(xexample_DGESV_colmajor lapacke) -target_link_libraries(xexample_DGELS_rowmajor lapacke) -target_link_libraries(xexample_DGELS_colmajor lapacke) +target_link_libraries(xexample_DGESV_rowmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGESV_colmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGELS_rowmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGELS_colmajor ${LAPACKELIB}) add_test(example_DGESV_rowmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_rowmajor) add_test(example_DGESV_colmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_colmajor) diff --git a/lapack-netlib/LAPACKE/lapacke.pc.in b/lapack-netlib/LAPACKE/lapacke.pc.in index 68da73957..0097c2597 100644 --- a/lapack-netlib/LAPACKE/lapacke.pc.in +++ b/lapack-netlib/LAPACKE/lapacke.pc.in @@ -5,6 +5,6 @@ Name: LAPACKE Description: C Standard Interface to LAPACK Linear Algebra PACKage Version: @LAPACK_VERSION@ URL: http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack -Libs: -L${libdir} -llapacke +Libs: -L${libdir} -l@LAPACKELIB@ Cflags: -I${includedir} -Requires.private: lapack +Requires.private: @LAPACKLIB@ diff --git a/lapack-netlib/SRC/CMakeLists.txt b/lapack-netlib/SRC/CMakeLists.txt index 
f19bdd302..bb1459165 100644 --- a/lapack-netlib/SRC/CMakeLists.txt +++ b/lapack-netlib/SRC/CMakeLists.txt @@ -500,21 +500,21 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(lapack ${SOURCES}) +add_library(${LAPACKLIB} ${SOURCES}) set_target_properties( - lapack PROPERTIES + ${LAPACKLIB} PROPERTIES VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) if(USE_XBLAS) - target_link_libraries(lapack PRIVATE ${XBLAS_LIBRARY}) + target_link_libraries(${LAPACKLIB} PRIVATE ${XBLAS_LIBRARY}) endif() -target_link_libraries(lapack PRIVATE ${BLAS_LIBRARIES}) +target_link_libraries(${LAPACKLIB} PRIVATE ${BLAS_LIBRARIES}) if(_is_coverage_build) - target_link_libraries(lapack PRIVATE gcov) - add_coverage(lapack) + target_link_libraries(${LAPACKLIB} PRIVATE gcov) + add_coverage(${LAPACKLIB}) endif() -lapack_install_library(lapack) +lapack_install_library(${LAPACKLIB}) diff --git a/lapack-netlib/TESTING/MATGEN/CMakeLists.txt b/lapack-netlib/TESTING/MATGEN/CMakeLists.txt index bc986da3a..3639c0320 100644 --- a/lapack-netlib/TESTING/MATGEN/CMakeLists.txt +++ b/lapack-netlib/TESTING/MATGEN/CMakeLists.txt @@ -47,6 +47,6 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(tmglib ${SOURCES}) -target_link_libraries(tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) -lapack_install_library(tmglib) +add_library(${TMGLIB} ${SOURCES}) +target_link_libraries(${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) +lapack_install_library(${TMGLIB}) From 8fe3555792f7a30df787721fe8e9e63883e5b58a Mon Sep 17 00:00:00 2001 From: "Larson, Eric" Date: Fri, 24 Sep 2021 13:03:59 -0700 Subject: [PATCH 26/41] ILP support long's in windows are 4 bytes (MSVS, intel compilers). Use int64_t and int32_t to ensure 8 byte integers for ILP interface. 
support 8 byte integer flag for intel ifort compiler --- lapack-netlib/CBLAS/include/cblas.h | 5 +++-- lapack-netlib/CBLAS/include/cblas_f77.h | 8 +++++++- lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake | 13 +++++++++++++ lapack-netlib/CMakeLists.txt | 2 +- lapack-netlib/LAPACKE/include/lapacke_config.h | 5 +++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/CBLAS/include/cblas.h b/lapack-netlib/CBLAS/include/cblas.h index 9e937964e..7593064f1 100644 --- a/lapack-netlib/CBLAS/include/cblas.h +++ b/lapack-netlib/CBLAS/include/cblas.h @@ -1,6 +1,7 @@ #ifndef CBLAS_H #define CBLAS_H #include <stddef.h> +#include <stdint.h> #ifdef __cplusplus @@ -11,9 +12,9 @@ extern "C" { /* Assume C declarations for C++ */ * Enumerated and derived types */ #ifdef WeirdNEC - #define CBLAS_INDEX long + #define CBLAS_INDEX int64_t #else - #define CBLAS_INDEX int + #define CBLAS_INDEX int32_t #endif typedef enum {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; diff --git a/lapack-netlib/CBLAS/include/cblas_f77.h b/lapack-netlib/CBLAS/include/cblas_f77.h index 36d4a7118..bb3f3a45d 100644 --- a/lapack-netlib/CBLAS/include/cblas_f77.h +++ b/lapack-netlib/CBLAS/include/cblas_f77.h @@ -9,6 +9,8 @@ #ifndef CBLAS_F77_H #define CBLAS_F77_H +#include <stdint.h> + #ifdef CRAY #include <fortran.h> #define F77_CHAR _fcd @@ -17,8 +19,12 @@ #define F77_STRLEN(a) (_fcdlen) #endif +#ifndef F77_INT #ifdef WeirdNEC - #define F77_INT long + #define F77_INT int64_t +#else + #define F77_INT int32_t +#endif #endif #ifdef F77_CHAR diff --git a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake index add0d1797..15a8f01d6 100644 --- a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake +++ b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake @@ -14,6 +14,19 @@ macro( CheckLAPACKCompilerFlags ) set( FPE_EXIT FALSE ) +# FORTRAN ILP default +if ( FORTRAN_ILP ) + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + if ( WIN32 ) + set(CMAKE_Fortran_FLAGS 
"${CMAKE_Fortran_FLAGS} /integer-size:64") + else () + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -integer-size 64") + endif() + else() + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fdefault-integer-8") + endif() +endif() + # GNU Fortran if( CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" ) if( "${CMAKE_Fortran_FLAGS}" MATCHES "-ffpe-trap=[izoupd]") diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index a30efbbfe..b704e72c5 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -53,7 +53,7 @@ if(BUILD_INDEX64) set(LAPACKELIB "lapacke64") set(TMGLIB "tmglib64") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWeirdNEC -DLAPACK_ILP64 -DHAVE_LAPACK_CONFIG_H") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fdefault-integer-8") + set(FORTRAN_ILP TRUE) else() set(BLASLIB "blas") set(CBLASLIB "cblas") diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 8262c3488..c6542955e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -49,12 +49,13 @@ extern "C" { #endif /* __cplusplus */ #include <stdlib.h> +#include <stdint.h> #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int long +#define lapack_int int64_t #else -#define lapack_int int +#define lapack_int int32_t #endif #endif From 4199ca728e50496ccabef2010b732e9245258ff8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 24 Mar 2022 21:23:28 +0100 Subject: [PATCH 27/41] Add LAPACK-like option to omit the LAPACK testsuite --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b99a7722..c4e51b747 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) + option(BUILD_WITHOUT_CBLAS 
"Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) @@ -311,7 +313,9 @@ if (NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() - add_subdirectory(lapack-netlib/TESTING) + if (BUILD_TESTING) + add_subdirectory(lapack-netlib/TESTING) + endif() if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) add_subdirectory(cpp_thread_test) endif() From 6c3842a891b489f63bb5c71468bdab4a3acfe387 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 24 Mar 2022 21:25:16 +0100 Subject: [PATCH 28/41] Disable the LAPACK testsuite for the Windows clang/flang build as it takes too long --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 10d639a4a..1545d56db 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -115,7 +115,7 @@ jobs: mkdir build cd build call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . --config Release ctest From aeb561d234fd720bd094216e11f7784c1579d0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 25 Mar 2022 13:37:15 +0100 Subject: [PATCH 29/41] Add support for Intel Fortran compilers. Port changes from upstream Reference-LAPACK. 
--- cmake/fc.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 9feda9be3..94199605d 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -67,7 +67,15 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") + if (WIN32) + set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -m32") From 8d0f7f0176881956c382fb343292638d4dd9f3a3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Mar 2022 13:10:47 +0200 Subject: [PATCH 30/41] Revert accidental change of generic ARMV8 DGEMM parameters from #3425 --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 8649e4486..f5cbe96b6 100644 --- a/param.h +++ b/param.h @@ -3423,8 +3423,8 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 From 09b8545fc51316d0fecf34c9e753b8a20358a3e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Mar 2022 15:24:40 +0200 Subject: [PATCH 31/41] Add initial support for M1 on Linux, Phytium FT2xxx series, ARM Cortex 510/710/X1/X2 --- Makefile.arm64 | 44 ++++++++++++++++++++++++++ TargetList.txt | 7 ++++ c_check | 1 + cpuid_arm64.c | 76 ++++++++++++++++++++++++++++++++++++++------ getarch.c | 86 ++++++++++++++++++++++++++++++++++++++++++-------- param.h | 4 +-- 6 files changed, 193 insertions(+), 25 deletions(-) diff 
--git a/Makefile.arm64 b/Makefile.arm64 index 2eade8d78..96e31a4fb 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -55,6 +55,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif endif +ifeq ($(CORE), FT2000) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif + # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) @@ -229,6 +236,43 @@ endif endif endif +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXX1) +CCOMMON_OPT += -march=armv9-a -mtune=cortex-x1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv9-a -mtune=cortex-x1 +endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXX2) +CCOMMON_OPT += -march=armv9-a -mtune=cortex-x2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv9-a -mtune=cortex-x2 +endif +endif +endif + +#ifeq (1, $(filter 1,$(ISCLANG))) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXA510) +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a+sve +endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXA710) +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a710 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a710 +endif +endif +endif + endif endif diff --git a/TargetList.txt b/TargetList.txt index a5a07a661..a297fd0e8 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -92,6 +92,10 @@ CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 +CORTEXA510 +CORTEXA710 +CORTEXX1 +CORTEXX2 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 @@ -103,6 +107,9 @@ THUNDERX2T99 TSV110 THUNDERX3T110 VORTEX +A64FX +ARMV8SVE +FT2000 9.System Z: ZARCH_GENERIC diff --git a/c_check b/c_check index e10ddfebc..f9d3f2ca2 100644 --- a/c_check +++ b/c_check @@ -316,6 +316,7 @@ if ($architecture ne $hostarch) { } $cross = 1 if ($os ne $hostos); +$cross = 0 if 
(($os eq "Android") && ($hostos eq "Linux") && ($ENV{TERMUX_APP_PID} ne "")); $openmp = "" if $ENV{USE_OPENMP} != 1; diff --git a/cpuid_arm64.c b/cpuid_arm64.c index cc3a82815..89ec18632 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -45,6 +45,10 @@ size_t length64=sizeof(value64); #define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEN2 17 +#define CPU_CORTEXX1 18 +#define CPU_CORTEXX2 19 +#define CPU_CORTEXA510 20 +#define CPU_CORTEXA710 21 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -59,6 +63,8 @@ size_t length64=sizeof(value64); #define CPU_VORTEX 13 // Fujitsu #define CPU_A64FX 15 +// Phytium +#define CPU_FT2000 22 static char *cpuname[] = { "UNKNOWN", @@ -73,12 +79,17 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", - "NEOVERSEV1" - "NEOVERSEN2" "THUNDERX3T110", "VORTEX", "CORTEXA55", - "A64FX" + "A64FX", + "NEOVERSEV1", + "NEOVERSEN2", + "CORTEXX1", + "CORTEXX2", + "CORTEXA510", + "CORTEXA710", + "FT2000" }; static char *cpuname_lower[] = { @@ -94,12 +105,17 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", - "neoversev1", - "neoversen2", "thunderx3t110", "vortex", "cortexa55", - "a64fx" + "a64fx", + "neoversev1", + "neoversen2", + "cortexx1", + "cortexx2", + "cortexa510", + "cortexa710", + "ft2000" }; int get_feature(char *search) @@ -182,6 +198,14 @@ int detect(void) return CPU_NEOVERSEN2; else if (strstr(cpu_part, "0xd05")) return CPU_CORTEXA55; + else if (strstr(cpu_part, "0xd46")) + return CPU_CORTEXA510; + else if (strstr(cpu_part, "0xd47")) + return CPU_CORTEXA710; + else if (strstr(cpu_part, "0xd44")) + return CPU_CORTEXX1; + else if (strstr(cpu_part, "0xd4c")) + return CPU_CORTEXX2; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -202,6 +226,13 @@ int detect(void) // Fujitsu else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) return CPU_A64FX; + // Apple + else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022")) +
return CPU_VORTEX; + // Phytium + else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661") + || strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663"))) + return CPU_FT2000; } p = (char *) NULL ; @@ -382,7 +413,24 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; - + case CPU_CORTEXA510: + case CPU_CORTEXA710: + case CPU_CORTEXX1: + case CPU_CORTEXX2: + printf("#define ARMV9\n"); + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; case CPU_FALKOR: printf("#define FALKOR\n"); printf("#define L1_CODE_SIZE 65536\n"); @@ -469,9 +517,9 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; -#ifdef __APPLE__ case CPU_VORTEX: printf("#define VORTEX \n"); +#ifdef __APPLE__ sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); @@ -480,10 +528,10 @@ void get_cpuconfig(void) printf("#define L1_DATA_SIZE %lld \n",value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); printf("#define L2_SIZE %lld \n",value64); +#endif printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; -#endif case CPU_A64FX: printf("#define A64FX\n"); printf("#define L1_CODE_SIZE 65535\n"); @@ -494,6 +542,16 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_FT2000: + 
printf("#define FT2000\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 33554432\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/getarch.c b/getarch.c index e49eac1a3..26a2dd45e 100644 --- a/getarch.c +++ b/getarch.c @@ -1232,7 +1232,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa53" #define CORENAME "CORTEXA53" -#else #endif #ifdef FORCE_CORTEXA57 @@ -1248,7 +1247,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" -#else #endif #ifdef FORCE_CORTEXA72 @@ -1264,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa72" #define CORENAME "CORTEXA72" -#else #endif #ifdef FORCE_CORTEXA73 @@ -1280,7 +1277,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa73" #define CORENAME "CORTEXA73" -#else +#endif + +#ifdef FORCE_CORTEXX1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXX1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXX1 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexx1" +#define CORENAME "CORTEXX1" +#endif + +#ifdef FORCE_CORTEXX2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXX2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXX2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexx2" +#define CORENAME "CORTEXX2" +#endif + +#ifdef FORCE_CORTEXA510 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA510" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA510 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexa510" +#define CORENAME "CORTEXA510" +#endif + +#ifdef FORCE_CORTEXA710 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA710" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA710 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME 
"cortexa710" +#define CORENAME "CORTEXA710" #endif #ifdef FORCE_NEOVERSEN1 @@ -1297,7 +1349,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" -#else #endif #ifdef FORCE_NEOVERSEV1 @@ -1314,7 +1365,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.4-a -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" -#else #endif @@ -1332,7 +1382,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.5-a -mtune=neoverse-n2" #define LIBNAME "neoversen2" #define CORENAME "NEOVERSEN2" -#else #endif #ifdef FORCE_CORTEXA55 @@ -1348,7 +1397,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa55" #define CORENAME "CORTEXA55" -#else #endif #ifdef FORCE_FALKOR @@ -1364,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "falkor" #define CORENAME "FALKOR" -#else #endif #ifdef FORCE_THUNDERX @@ -1379,7 +1426,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx" #define CORENAME "THUNDERX" -#else #endif #ifdef FORCE_THUNDERX2T99 @@ -1397,7 +1443,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" -#else #endif #ifdef FORCE_TSV110 @@ -1413,7 +1458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "tsv110" #define CORENAME "TSV110" -#else #endif #ifdef FORCE_EMAG8180 @@ -1448,7 +1492,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx3t110" #define CORENAME "THUNDERX3T110" -#else #endif #ifdef FORCE_VORTEX @@ -1480,7 +1523,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "a64fx" #define CORENAME "A64FX" -#else +#endif + +#ifdef FORCE_FT2000 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "FT2000" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DFT2000 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=33554432 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "ft2000" +#define CORENAME "FT2000" #endif #ifdef FORCE_ZARCH_GENERIC diff --git a/param.h b/param.h index f5cbe96b6..792c178ba 100644 --- a/param.h +++ b/param.h @@ -3130,7 +3130,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3377,7 +3377,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) +#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ From 57dd92a662b8171498b94ef8855612832c2a152b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Mar 2022 15:26:42 +0200 Subject: [PATCH 32/41] Add initial support for ARMV9 Cortex 510/710/X1/X2 --- cmake/cc.cmake | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 06bc14986..2d3f7f1e4 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL CORTEXA510) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL CORTEXA710) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL CORTEXX1) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL CORTEXX2) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + if (${CORE} STREQUAL POWER10) if (NOT DYNAMIC_ARCH) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) From 
b3b4672c30f613c0043ad0557d33a34ffa3bbd0d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Mar 2022 15:29:20 +0200 Subject: [PATCH 33/41] Add initial support for Phytium FT2000 series and ARMV9 Cortex 510/710/X1/X2 --- kernel/arm64/KERNEL.CORTEXA510 | 216 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.CORTEXA710 | 216 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.CORTEXX1 | 216 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.CORTEXX2 | 216 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.FT2000 | 3 + 5 files changed, 867 insertions(+) create mode 100644 kernel/arm64/KERNEL.CORTEXA510 create mode 100644 kernel/arm64/KERNEL.CORTEXA710 create mode 100644 kernel/arm64/KERNEL.CORTEXX1 create mode 100644 kernel/arm64/KERNEL.CORTEXX2 create mode 100644 kernel/arm64/KERNEL.FT2000 diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA510 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c 
+CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = 
sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c 
+CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA710 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = 
trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c 
+SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = 
zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXX1 b/kernel/arm64/KERNEL.CORTEXX1 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXX1 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN 
= trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c 
+SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = 
zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXX2 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN 
= trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c 
+SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = 
zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.FT2000 b/kernel/arm64/KERNEL.FT2000 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.FT2000 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + From 0b69fa6ddf7fc6d92bc42ef085f39337a4489f3c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Mon, 28 Mar 2022 08:14:52 +0200 Subject: [PATCH 34/41] Use CC and full command line instead of hard-coding gcc for AVX512 checking Hard-coding gcc may provide incorrect results when a different compiler for the target build is used. To remain in sync with the main call to c_check, pass the full command line.
Signed-off-by: Egbert Eich --- Makefile.prebuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 4dad74d63..5e8874d42 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -71,7 +71,7 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - avx512=$$(perl c_check - - gcc | grep NO_AVX512); \ + avx512=$$(perl c_check - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy From 848722926c27ec385df749f032a56d59a10ecb1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 28 Mar 2022 17:18:56 +0200 Subject: [PATCH 35/41] CortexX1 is only ARMV8 --- Makefile.arm64 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 96e31a4fb..9844d2083 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -238,18 +238,18 @@ endif ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq ($(CORE), CORTEXX1) -CCOMMON_OPT += -march=armv9 -mtune=cortexx1 +CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv9 -mtune=cortexx1 +FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 endif endif endif ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq ($(CORE), CORTEXX2) -CCOMMON_OPT += -march=armv9 -mtune=cortexx2 +CCOMMON_OPT += -march=armv8.4-a+sve ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv9 -mtune=cortexx2 +FCOMMON_OPT += -march=armv8.4-a+sve endif endif endif @@ -266,9 +266,9 @@ endif ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq ($(CORE), CORTEXA710) -CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortexa710 +CCOMMON_OPT += -march=armv8.4-a+sve ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortexa710 +FCOMMON_OPT += -march=armv8.4-a+sve endif endif endif From 115bc9b98fe5182a077958d494eaace019c689c3 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Mon, 28 Mar 2022 17:28:29 +0200 Subject: [PATCH 36/41] CortexX1 is ARMV8 like A7x --- kernel/arm64/KERNEL.CORTEXX1 | 217 +---------------------------------- 1 file changed, 1 insertion(+), 216 deletions(-) diff --git a/kernel/arm64/KERNEL.CORTEXX1 b/kernel/arm64/KERNEL.CORTEXX1 index bd25f7cd8..a077ab4f3 100644 --- a/kernel/arm64/KERNEL.CORTEXX1 +++ b/kernel/arm64/KERNEL.CORTEXX1 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = 
zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = 
dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c 
-ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.CORTEXA57 From 48e421934f38215993538ce3e6979a2b14a6bbbe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 28 Mar 2022 17:31:26 +0200 Subject: [PATCH 37/41] CortexX1 is only ArmV8 --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 26a2dd45e..4af986fb3 100644 --- a/getarch.c +++ b/getarch.c @@ -1288,7 +1288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexx1" #define CORENAME "CORTEXX1" #endif From d93cf7f23c6633dcffe5b98bc1cd6abd183aea9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 28 Mar 2022 17:37:06 +0200 Subject: [PATCH 38/41] fix defines for CORTEX-X --- param.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/param.h b/param.h index 792c178ba..03b98f41f 100644 --- a/param.h +++ b/param.h @@ -3128,7 +3128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 -#if defined(CORTEXA57) || \ +#if defined(CORTEXA57) || defined(CORTEXX1) \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) @@ -3147,7 +3147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. 
So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -3377,7 +3377,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510) +#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ From abbc65cff4825686bc5684453109b315b1682d17 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 28 Mar 2022 17:40:27 +0200 Subject: [PATCH 39/41] Cortex X1 is only Arm8.2 --- cmake/cc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 2d3f7f1e4..57e42781d 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -175,7 +175,7 @@ endif () if (${CORE} STREQUAL CORTEXX1) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") endif () endif () From a55a06c2694ed6835d0341de817e8df394b3978f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 28 Mar 2022 18:10:08 +0200 Subject: [PATCH 40/41] Update param.h --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 03b98f41f..09170ba23 100644 --- a/param.h +++ b/param.h @@ -3128,7 +3128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. #define SYMV_P 16 -#if defined(CORTEXA57) || defined(CORTEXX1) \ +#if defined(CORTEXA57) || defined(CORTEXX1) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) From 5e6d1600203f9ce48987736a4bbcd5904bd3ca7c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Sun, 13 Mar 2022 10:57:59 +0100 Subject: [PATCH 41/41] Do not include symbols defined in driver/others/parameter.c in DYNAMIC_ARCH driver/others/parameter.c does not get built during DYNAMIC_ARCH, thus, do not declare its symbols. This will make the build fail early and in an obvious way if functions are trying to use these symbols. Signed-off-by: Egbert Eich --- common_macro.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common_macro.h b/common_macro.h index 9826f1809..d2fa822c2 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2610,8 +2610,9 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) +#if !defined(DYNAMIC_ARCH) \ + && (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \ + || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p;