From 68c414d3a6d9af7f8a686868feeddcd237977b05 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:40:59 +0100 Subject: [PATCH] ztrmm sve copy functions --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 14 +++++++------- kernel/arm64/ztrmm_ltcopy_sve_v1.c | 12 ++++++------ kernel/arm64/ztrmm_uncopy_sve_v1.c | 14 +++++++------- kernel/arm64/ztrmm_utcopy_sve_v1.c | 12 ++++++------ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index 19c34ff41..d34f607ab 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda*2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda*2); + svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -99,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -113,8 +113,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index c272db602..7f34c9857 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -101,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #else @@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index aaa217063..7eb9452c9 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda * 2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda * 2); + svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -105,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #else @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index c3e1f1b42..60c8ff3b4 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -95,8 +95,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -109,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active;