From 0de36f7b5ceea1c410ed98e62fd4748e9cc9324d Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 29 Nov 2021 21:25:05 +0100 Subject: [PATCH] trmm sve copy fucntions for single precision --- kernel/arm64/trmm_lncopy_sve_v1.c | 21 ++++++++++++++++++--- kernel/arm64/trmm_ltcopy_sve_v1.c | 15 +++++++++++++++ kernel/arm64/trmm_uncopy_sve_v1.c | 21 ++++++++++++++++++--- kernel/arm64/trmm_utcopy_sve_v1.c | 15 +++++++++++++++ 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index fc1b61325..918e945ac 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - svint64_t index = svindex_s64(0LL, lda); - - FLOAT *ao; js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X > posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); ao ++; b += n_active; @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; } diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 14c6762d2..b76cc56de 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; +#ifdef DOUBLE svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); ao += lda; b += n_active; @@ -112,9 +121,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index b8344d474..75fa163ae 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - svint64_t index = svindex_s64(0LL, lda); - - FLOAT *ao; js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); ao ++; b += n_active; @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; } diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 9be1c0abb..36a03242a 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; +#ifdef DOUBLE svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); ao += lda; b += n_active; @@ -111,9 +120,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; }