Merge pull request #4160 from Mousius/sve-sniff

Add ARMV8SVE to AArch64 Dynamic Dispatch
This commit is contained in:
Martin Kroeker 2023-07-26 13:46:16 +02:00 committed by GitHub
commit b1f6c4a1e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 160 additions and 84 deletions

View File

@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
ifneq ($(NO_SVE), 1) ifneq ($(NO_SVE), 1)
DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
endif endif
DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR DYNAMIC_CORE += FALKOR

View File

@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
if (ARM64) if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
endif () endif ()
if (DYNAMIC_LIST) if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
#else #else
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif #endif
#ifdef DYN_ARMV8SVE
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55 #ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55; extern gotoblas_t gotoblas_CORTEXA55;
#else #else
@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#ifndef NO_SVE #ifndef NO_SVE
extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
#else #else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif #endif
extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55; extern gotoblas_t gotoblas_CORTEXA55;
@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1 #define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 13 #define NUM_CORETYPES 16
/* /*
* In case asm/hwcap.h is outdated on the build system, make sure * In case asm/hwcap.h is outdated on the build system, make sure
@ -173,6 +181,7 @@ static char *corename[] = {
"neoversen2", "neoversen2",
"thunderx3t110", "thunderx3t110",
"cortexa55", "cortexa55",
"armv8sve",
"unknown" "unknown"
}; };
@ -192,6 +201,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
return corename[NUM_CORETYPES]; return corename[NUM_CORETYPES];
} }
@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 12: return (&gotoblas_NEOVERSEN2); case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110); case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55); case 14: return (&gotoblas_CORTEXA55);
case 15: return (&gotoblas_ARMV8SVE);
} }
snprintf(message, 128, "Core not found: %s\n", coretype); snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message); openblas_warning(1, message);
@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg); openblas_warning(1, coremsg);
} }
#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
}
#endif
return NULL; return NULL;
#endif #endif
} }

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b32(j, n); svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg); uint32_t active = svcntp_b32(svptrue_b32(), pg);
do { do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2; aoffset += active * lda * 2;
j += svcntw(); j += svcntw();
pg = svwhilelt_b32(j, n); pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b32(j, n); svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg); uint32_t active = svcntp_b32(svptrue_b32(), pg);
do { do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2; aoffset += active * 2;
j += svcntw(); j += svcntw();
pg = svwhilelt_b32(j, n); pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg); uint64_t active = svcntp_b64(svptrue_b64(), pg);
do { do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2; aoffset += active * lda * 2;
j += svcntd(); j += svcntd();
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg); uint64_t active = svcntp_b64(svptrue_b64(), pg);
do { do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2; aoffset += active * 2;
j += svcntd(); j += svcntd();
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL); svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0; int32_t j = 0;
int32_t N = n; int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0); svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag); data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL); svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
#else #else
@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0; int32_t j = 0;
int32_t N = n; int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag); data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0); svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

41
param.h
View File

@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096
#elif defined(NEOVERSEV1) #elif defined(NEOVERSEV1) // 256-bit SVE
#if defined(XDOUBLE) || defined(DOUBLE) #if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8 #define SWITCH_RATIO 8
@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) #elif defined(A64FX) // 512-bit SVE
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". /* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
#define CGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
/* GEMM default parameters used when any of ARMV8SVE, ARMV9, CORTEXA510,
   CORTEXA710 or CORTEXX2 is defined (labelled "128-bit SVE" above).
   The S/D/C/Z prefixes are presumably the usual BLAS precision prefixes
   (single, double, complex, double complex) - confirm against the kernels. */
/* SWITCH_RATIO is 8 for DOUBLE/XDOUBLE builds and 16 for all other builds. */
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif
/* Unroll factors per precision (M and N; complex types also define an MN value). */
/* NOTE(review): a 128-bit vector holds 4 single-precision values, so the "(8)"
   in the comment below looks inherited from a wider-vector configuration - confirm. */
#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept separate to keep copies separate
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_MN 16
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_MN 16
/* P, Q and R defaults per precision; presumably the cache-blocking sizes used
   by the GEMM driver - TODO confirm where these are consumed. */
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
/* R is the same (4096) for every precision in this configuration. */
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#else /* Other/undetected ARMv8 cores */ #else /* Other/undetected ARMv8 cores */
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16