diff --git a/Makefile.system b/Makefile.system index 7d26eccc3..62926b380 100644 --- a/Makefile.system +++ b/Makefile.system @@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e6e434a0a..07df31b89 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index ef2597234..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; @@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -173,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -192,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c index 6aa44a8f6..2fdaf5fcd 100644 --- a/kernel/arm64/cgemm_ncopy_sve_v1.c +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c index 748cd954e..086a2fed1 100644 --- a/kernel/arm64/cgemm_tcopy_sve_v1.c +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index 6ba4afc8b..e138f0647 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 32da5bd16..9a4cb6d4f 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index ac4019e26..fdda992e0 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 8fdcd0f4b..1a03aa974 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 0f5f0dccd..b06166f36 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 8f9b4268a..6b8c93baf 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index c6e50bc1c..fd8d2190f 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 37dbfe4e1..615667264 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 21e03b7be..f763d5607 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); #else @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c index 6f18aa956..5a17d3b19 100644 --- a/kernel/arm64/zsymm_lcopy_sve.c +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c index 6be48cdaf..06989e3aa 100644 --- a/kernel/arm64/zsymm_ucopy_sve.c +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index d34f607ab..5a7171d9d 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index 7f34c9857..3a88f26b2 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index 7eb9452c9..c3dbdcbe3 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index 60c8ff3b4..ddfa7ba4e 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c index eb7cd0294..f81ba14c2 100644 --- a/kernel/arm64/ztrsm_lncopy_sve.c +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 34dbf8a30..46a11abed 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c index 92e086b75..436112130 100644 --- a/kernel/arm64/ztrsm_uncopy_sve.c +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index ccb942e1b..ddf3e265f 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/param.h b/param.h index aa193a284..84e0c2ac7 100644 --- a/param.h +++ b/param.h @@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16