some clean-up & commentary

Bine Brank 2021-11-21 14:56:27 +01:00
parent e6ed4be02e
commit b58d4f31ab
10 changed files with 104 additions and 107 deletions

View File

@@ -143,7 +143,7 @@ endif
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
+DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 DGEMMINCOPY = dgemm_ncopy_sve_v1.c

View File

@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z2.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 	ld1d z0.d, p1/z, [pA]
 	ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-	//incb pA, all, mul #2
 	add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
 	ld1rd z8.d, p0/z, [pB]
@@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ptrue p0.d // create true predicate

 	mov pB, origPB
+	// Loop over N
 	mov counterJ, origN
 	asr counterJ, counterJ, #3 // J = J / 8
 	cmp counterJ, #0
 	ble .Ldgemm_kernel_L4_BEGIN

 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

 .align 5
 .Ldgemm_kernel_L8_BEGIN:
@@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldgemm_kernel_L8_Mv1_BEGIN:

+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
 	mov counterI, #0
-	whilelt p1.d, counterI, origM //SVE instruction
+	whilelt p1.d, counterI, origM
 	cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension

 .align 5
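The whilelt/cntp pair above is what makes the M loop vector-length agnostic: the predicate covers min(SVE_LEN, M - counterI) lanes, so the final M % SVE_LEN rows run through the same code path instead of a scalar tail loop. A minimal C-intrinsics sketch of the same idiom (illustrative only; scale_column and its arguments are placeholders, not part of the kernel):

    #include <arm_sve.h>
    #include <stdint.h>

    /* Predicated loop over M: whilelt builds a predicate covering
       min(SVE_LEN, M - i) lanes, so the last partial tile needs no scalar tail. */
    void scale_column(int64_t M, double alpha, const double *a, double *c) {
        int64_t i = 0;
        svbool_t p1 = svwhilelt_b64(i, M);
        while (svptest_any(svptrue_b64(), p1)) {
            uint64_t lanes = svcntp_b64(svptrue_b64(), p1); /* like 'cntp lanes, p0, p1.d' */
            svfloat64_t va = svld1(p1, a + i);              /* masked load of the tile */
            svst1(p1, c + i, svmul_x(p1, va, alpha));       /* masked store back */
            i += (int64_t)lanes;
            p1 = svwhilelt_b64(i, M);
        }
    }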
@@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bgt .Ldgemm_kernel_L8_BEGIN

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

 .align 5
 .Ldgemm_kernel_L4_BEGIN:
@@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add origPB, origPB, temp // B = B + K * 4 * 8

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

 .align 5
 .Ldgemm_kernel_L2_BEGIN:
@@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

 .align 5
 .Ldgemm_kernel_L1_BEGIN:

View File

@@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

+/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
+However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
+This means that we sweep two panels of packed A when iterating in a loop over K.
+With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */
+
 #define ASSEMBLER
 #include "common.h"
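Put concretely, the layout this header comment describes means packed A is a sequence of 1*SVE_LEN x K panels, and the v2 kernel simply walks two adjacent panels in lockstep. A simplified C sketch (two_panel_col is a hypothetical helper, reduced to a single column of B for brevity; the real kernel keeps eight such accumulator pairs in z-registers):

    #include <arm_sve.h>
    #include <stdint.h>

    /* Panel p of packed A starts at packed_a + p * VL * K, so the second
       panel sits at a fixed offset from the first -- no repacking needed. */
    void two_panel_col(const double *packed_a, const double *packed_b,
                       int64_t K, double *c /* 2*VL doubles */) {
        uint64_t vl = svcntd();                 /* SVE vector length in doubles */
        const double *pA1 = packed_a;           /* first 1*SVE_LEN x K panel */
        const double *pA2 = packed_a + vl * K;  /* second panel, same layout */
        svbool_t p = svptrue_b64();
        svfloat64_t c0 = svdup_f64(0.0), c1 = svdup_f64(0.0);
        for (int64_t k = 0; k < K; k++) {
            svfloat64_t b0 = svdup_f64(packed_b[k]);  /* broadcast one B value */
            c0 = svmla_x(p, c0, svld1(p, pA1 + k * vl), b0);
            c1 = svmla_x(p, c1, svld1(p, pA2 + k * vl), b0);
        }
        svst1(p, c, c0);
        svst1(p, c + vl, c1);
    }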
@@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z7.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v00 ALPHA -> pA10_0
 //v01 pA10_1
-//v02
-//v03
+//v02 pA20_0
+//v03 pA20_1
 //v04
 //v05
 //v06
@@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v21 must save C5
 //v22 must save C6
 //v23 must save C7
+//v24 must save C8
+//v25 must save C9
+//v26 must save C10
+//v27 must save C11
+//v28 must save C12
+//v29 must save C13
+//v30 must save C14
+//v31 must save C15

 /*******************************************************************************
 * Macro definitions
@@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 	ld1d z0.d, p1/z, [pA1]
 	ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
-	//incb pA1, all, mul #2
 	add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
 	ld1rd z8.d, p0/z, [pB]
@@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ptrue p0.d // create true predicate

 	mov pB, origPB
+	// Loop over N
 	mov counterJ, origN
 	asr counterJ, counterJ, #3 // J = J / 8
 	cmp counterJ, #0
 	ble .Ldgemm_kernel_L4_BEGIN

 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

 .align 5
 .Ldgemm_kernel_L8_BEGIN:
@@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldgemm_kernel_L8_Mv2_BEGIN:

 	mov counterI, #0
-	cmp origM, vec_lenx2
+	cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
 	blt .Ldgemm_kernel_L8_Mv1_BEGIN
 	mov counterI, origM

+/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */
 	mul temp, vec_len, origK // generate address of pA2
 	add pA2, pA1, temp, lsl #3 // pA1 = start of A array
 	prfm PLDL1KEEP, [pA2]
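These two instructions implement the panel offset described in the header comment: temp = SVE_LEN * K elements, and the lsl #3 scales by sizeof(double), so

    pA2 = pA1 + SVE_LEN * K * 8 bytes

i.e. pA2 starts exactly one packed 1*SVE_LEN x K panel past pA1 (the `// pA1 = start of A array` comment refers to the base operand of the add, not its result).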
@@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp counterI, origM
 	beq .Ldgemm_kernel_L8_END

-//////////////////////////////////
+//////////////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
 .Ldgemm_kernel_L8_Mv1_BEGIN:

 	whilelt p1.d, counterI, origM //SVE instruction
@@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bgt .Ldgemm_kernel_L8_BEGIN

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

 .align 5
 .Ldgemm_kernel_L4_BEGIN:
@@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	beq .Ldgemm_kernel_L4_END

 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
 .Ldgemm_kernel_L4_Mv1_BEGIN:

 	whilelt p1.d, counterI, origM //SVE instruction
@@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add origPB, origPB, temp // B = B + K * 4 * 8

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

 .align 5
 .Ldgemm_kernel_L2_BEGIN:
@@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
 .Ldgemm_kernel_L2_Mv1_BEGIN:

 	whilelt p1.d, counterI, origM //SVE instruction
@@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

 .align 5
 .Ldgemm_kernel_L1_BEGIN:
@@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
 .Ldgemm_kernel_L1_Mv1_BEGIN:

 	whilelt p1.d, counterI, origM //SVE instruction

View File

@@ -40,40 +40,40 @@
 #include "common.h"
 #include <arm_sve.h>

-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
   BLASLONG j;
   IFLOAT *aoffset, *aoffset1, *boffset;

   svint64_t lda_vec = svindex_s64(0LL, lda);
   uint64_t sve_size = svcntd();

   aoffset = a;
   boffset = b;

   j = 0;
   svbool_t pg = svwhilelt_b64(j, n);
   uint64_t active = svcntp_b64(svptrue_b64(), pg);
   do {
     aoffset1 = aoffset;

     uint64_t i_cnt = m;
     while (i_cnt--) {
       svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
       svst1_f64(pg, (double *) boffset, a_vec);
       aoffset1++;
       boffset += active;
     }
     aoffset += sve_size * lda;

     j += svcntd();
     pg = svwhilelt_b64(j, n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));

   return 0;
 }
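To illustrate the layout this gather-based copy produces (not part of the commit): each row contributes `active` consecutive column entries to b before the next row is visited, one block of sve_size columns at a time. The driver below treats CNAME as a plain extern, though in OpenBLAS it is a macro expanded by the build system, and the exact grouping in b depends on the machine's SVE vector length:

    #include <stdio.h>

    /* Hypothetical standalone binding; in OpenBLAS, CNAME/BLASLONG/IFLOAT
       come from common.h and the build configuration. */
    extern int CNAME(long m, long n, double *a, long lda, double *b);

    int main(void) {
        long m = 3, n = 4, lda = 3;
        double a[12], b[12];
        for (long j = 0; j < n; j++)
            for (long i = 0; i < m; i++)
                a[i + j * lda] = 10 * i + j;  /* a(i,j) = 10i + j, column-major */
        CNAME(m, n, a, lda, b);
        for (int k = 0; k < 12; k++) printf("%g ", b[k]);
        printf("\n"); /* with 2 active lanes: 0 1 10 11 20 21 2 3 12 13 22 23 */
        return 0;
    }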

View File

@@ -40,38 +40,38 @@
 #include "common.h"
 #include <arm_sve.h>

-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
   BLASLONG j;
   IFLOAT *aoffset, *aoffset1, *boffset;

   uint64_t sve_size = svcntd();

   aoffset = a;
   boffset = b;

   j = 0;
   svbool_t pg = svwhilelt_b64(j, n);
   uint64_t active = svcntp_b64(svptrue_b64(), pg);
   do {
     aoffset1 = aoffset;

     uint64_t i_cnt = m;
     while (i_cnt--) {
       svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
       svst1_f64(pg, (double *) boffset, a_vec);
       aoffset1 += lda;
       boffset += active;
     }
     aoffset += sve_size;

     j += svcntd();
     pg = svwhilelt_b64(j, n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));

   return 0;
 }

View File

@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z2.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 	ld1d z0.d, p1/z, [pA]
 	ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-	//incb pA, all, mul #2
 	add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
 	ld1rd z8.d, p0/z, [pB]
@@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 	mov pB, origPB
+	// Loop over N
 	mov counterJ, origN
 	asr counterJ, counterJ, #3 // J = J / 8
 	cmp counterJ, #0
 	ble .Ldtrmm_kernel_L4_BEGIN

 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

 .align 5
 .Ldtrmm_kernel_L8_BEGIN:
@@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldtrmm_kernel_L8_Mv1_BEGIN:

+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
 	mov counterI, #0
-	whilelt p1.d, counterI, origM //SVE instruction
-	cntp lanes, p0, p1.d
+	whilelt p1.d, counterI, origM
+	cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension

 .align 5
 .Ldtrmm_kernel_L8_Mv1_20:
@@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bgt .Ldtrmm_kernel_L8_BEGIN

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

 .align 5
 .Ldtrmm_kernel_L4_BEGIN:
@@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

 .align 5
 .Ldtrmm_kernel_L2_BEGIN:
@@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

 .align 5
 .Ldtrmm_kernel_L1_BEGIN:

View File

@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;

-  //printf("Using trmm_ln.\n");
   int sve_len = svcntd();
   svint64_t index = svindex_s64(0LL, lda);
@@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   }

   i = 0;
-  /* svbool_t pm = svwhilelt_b64(i, m); */
-  /* int m_active = svcntp_b64(svptrue_b64(), pm); */
   do
   {
-    if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+    if (X > posY) {
       svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
       svst1(pn, b, aj_vec);
       ao ++;
@@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
       X ++;
       i ++;
     } else {
+      /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
       int temp = 0;
       for (int j = 0; j < n_active; j++) {
@@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     }
   } while (i < m);

-  //printf("\n");
   posY += n_active;
   js += n_active;
   pn = svwhilelt_b64(js, n);

View File

@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;

-  //printf("Using trmm_lt.\n");
-
   int sve_len = svcntd();

   FLOAT *ao;
@@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   }

   i = 0;
-  /* svbool_t pm = svwhilelt_b64(i, m); */
-  /* int m_active = svcntp_b64(svptrue_b64(), pm); */
   do
   {
-    if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+    if (X > posY) {
       ao ++;
       b += n_active;
       X ++;
@@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
       X ++;
       i ++;
     } else {
+      /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
       int temp = 0;
       for (int j = 0; j < n_active; j++) {
@@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     }
   } while (i < m);

-  //printf("\n");
   posY += n_active;
   js += n_active;

View File

@@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;

-  //printf("Using trmm_un.\n");
-  //printf("Using m %ld, n %ld.\n", m, n);
-  //printf("Using lda %ld.\n", lda);
-  //printf("Using posX %ld, posY %ld.\n", posX, posY);
   int sve_len = svcntd();
   svint64_t index = svindex_s64(0LL, lda);
@@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   }

   i = 0;
-  /* svbool_t pm = svwhilelt_b64(i, m); */
-  /* int m_active = svcntp_b64(svptrue_b64(), pm); */
   do
   {
-    if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+    if (X < posY) {
       svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
       svst1(pn, b, aj_vec);
       ao ++;
@@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
       X ++;
       i ++;
     } else {
+      /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
       int temp = 0;
       for (int j = 0; j < n_active; j++) {
@@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     }
   } while (i < m);

-  //printf("\n");
   posY += n_active;
   js += n_active;
   pn = svwhilelt_b64(js, n);

View File

@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;

-  //printf("Using trmm_ut.\n");
   int sve_len = svcntd();
@@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   }

   i = 0;
-  /* svbool_t pm = svwhilelt_b64(i, m); */
-  /* int m_active = svcntp_b64(svptrue_b64(), pm); */
   do
   {
-    if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+    if (X < posY) {
       ao ++;
       b += n_active;
       X ++;
@@ -84,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
       X ++;
       i ++;
     } else {
+      /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
       int temp = 0;
       for (int j = 0; j < n_active; j++) {
@@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     }
   } while (i < m);

-  //printf("\n");
   posY += n_active;
   js += n_active;
   pn = svwhilelt_b64(js, n);