some clean-up & commentary
This commit is contained in:
parent
e6ed4be02e
commit
b58d4f31ab
|
@ -143,7 +143,7 @@ endif
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||||
|
|
|
@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define alpha0 d10
|
#define alpha0 d10
|
||||||
#define alphaZ z2.d
|
#define alphaZ z2.d
|
||||||
|
|
||||||
#define A_PRE_SIZE 2560
|
#define A_PRE_SIZE 1536
|
||||||
#define B_PRE_SIZE 512
|
#define B_PRE_SIZE 512
|
||||||
#define C_PRE_SIZE 128
|
#define C_PRE_SIZE 128
|
||||||
|
|
||||||
|
@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNELv1x8_I
|
.macro KERNELv1x8_I
|
||||||
ld1d z0.d, p1/z, [pA]
|
ld1d z0.d, p1/z, [pA]
|
||||||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
||||||
//incb pA, all, mul #2
|
|
||||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||||
|
|
||||||
ld1rd z8.d, p0/z, [pB]
|
ld1rd z8.d, p0/z, [pB]
|
||||||
|
@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ptrue p0.d // create true predicate
|
ptrue p0.d // create true predicate
|
||||||
|
|
||||||
mov pB, origPB
|
mov pB, origPB
|
||||||
|
// Loop over N
|
||||||
mov counterJ, origN
|
mov counterJ, origN
|
||||||
asr counterJ, counterJ, #3 // J = J / 8
|
asr counterJ, counterJ, #3 // J = J / 8
|
||||||
cmp counterJ, #0
|
cmp counterJ, #0
|
||||||
ble .Ldgemm_kernel_L4_BEGIN
|
ble .Ldgemm_kernel_L4_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
/* Repeat this as long as there are 8 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L8_BEGIN:
|
.Ldgemm_kernel_L8_BEGIN:
|
||||||
|
@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
||||||
|
|
||||||
|
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
|
||||||
mov counterI, #0
|
mov counterI, #0
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM
|
||||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
|
@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
bgt .Ldgemm_kernel_L8_BEGIN
|
bgt .Ldgemm_kernel_L8_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 4 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L4_BEGIN:
|
.Ldgemm_kernel_L4_BEGIN:
|
||||||
|
@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add origPB, origPB, temp // B = B + K * 4 * 8
|
add origPB, origPB, temp // B = B + K * 4 * 8
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 2 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L2_BEGIN:
|
.Ldgemm_kernel_L2_BEGIN:
|
||||||
|
@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 1 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L1_BEGIN:
|
.Ldgemm_kernel_L1_BEGIN:
|
||||||
|
|
|
@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*******************************************************************************/
|
*******************************************************************************/
|
||||||
|
|
||||||
|
/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
|
||||||
|
However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
|
||||||
|
This means that we sweep two panels of packed A when iterating in a loop over K.
|
||||||
|
With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */
|
||||||
|
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define alpha0 d10
|
#define alpha0 d10
|
||||||
#define alphaZ z7.d
|
#define alphaZ z7.d
|
||||||
|
|
||||||
#define A_PRE_SIZE 2560
|
#define A_PRE_SIZE 1536
|
||||||
#define B_PRE_SIZE 512
|
#define B_PRE_SIZE 512
|
||||||
#define C_PRE_SIZE 128
|
#define C_PRE_SIZE 128
|
||||||
|
|
||||||
|
@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
//v00 ALPHA -> pA10_0
|
//v00 ALPHA -> pA10_0
|
||||||
//v01 pA10_1
|
//v01 pA10_1
|
||||||
//v02
|
//v02 pA20_0
|
||||||
//v03
|
//v03 pA20_1
|
||||||
//v04
|
//v04
|
||||||
//v05
|
//v05
|
||||||
//v06
|
//v06
|
||||||
|
@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
//v21 must save C5
|
//v21 must save C5
|
||||||
//v22 must save C6
|
//v22 must save C6
|
||||||
//v23 must save C7
|
//v23 must save C7
|
||||||
|
//v24 must save C8
|
||||||
|
//v25 must save C9
|
||||||
|
//v26 must save C10
|
||||||
|
//v27 must save C11
|
||||||
|
//v28 must save C12
|
||||||
|
//v29 must save C13
|
||||||
|
//v30 must save C14
|
||||||
|
//v31 must save C15
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
* Macro definitions
|
* Macro definitions
|
||||||
|
@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNELv1x8_I
|
.macro KERNELv1x8_I
|
||||||
ld1d z0.d, p1/z, [pA1]
|
ld1d z0.d, p1/z, [pA1]
|
||||||
ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
|
ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
|
||||||
//incb pA1, all, mul #2
|
|
||||||
add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
|
add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
|
||||||
|
|
||||||
ld1rd z8.d, p0/z, [pB]
|
ld1rd z8.d, p0/z, [pB]
|
||||||
|
@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ptrue p0.d // create true predicate
|
ptrue p0.d // create true predicate
|
||||||
|
|
||||||
mov pB, origPB
|
mov pB, origPB
|
||||||
|
// Loop over N
|
||||||
mov counterJ, origN
|
mov counterJ, origN
|
||||||
asr counterJ, counterJ, #3 // J = J / 8
|
asr counterJ, counterJ, #3 // J = J / 8
|
||||||
cmp counterJ, #0
|
cmp counterJ, #0
|
||||||
ble .Ldgemm_kernel_L4_BEGIN
|
ble .Ldgemm_kernel_L4_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
/* Repeat this as long as there are 8 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L8_BEGIN:
|
.Ldgemm_kernel_L8_BEGIN:
|
||||||
|
@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.Ldgemm_kernel_L8_Mv2_BEGIN:
|
.Ldgemm_kernel_L8_Mv2_BEGIN:
|
||||||
|
|
||||||
mov counterI, #0
|
mov counterI, #0
|
||||||
cmp origM, vec_lenx2
|
cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
|
||||||
blt .Ldgemm_kernel_L8_Mv1_BEGIN
|
blt .Ldgemm_kernel_L8_Mv1_BEGIN
|
||||||
|
|
||||||
mov counterI, origM
|
mov counterI, origM
|
||||||
|
|
||||||
|
/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */
|
||||||
mul temp, vec_len, origK // generate address of pA2
|
mul temp, vec_len, origK // generate address of pA2
|
||||||
add pA2, pA1, temp, lsl #3 // pA1 = start of A array
|
add pA2, pA1, temp, lsl #3 // pA1 = start of A array
|
||||||
prfm PLDL1KEEP, [pA2]
|
prfm PLDL1KEEP, [pA2]
|
||||||
|
@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp counterI, origM
|
cmp counterI, origM
|
||||||
beq .Ldgemm_kernel_L8_END
|
beq .Ldgemm_kernel_L8_END
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////////////
|
||||||
|
// We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
|
||||||
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
||||||
|
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM //SVE instruction
|
||||||
|
@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
bgt .Ldgemm_kernel_L8_BEGIN
|
bgt .Ldgemm_kernel_L8_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 4 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L4_BEGIN:
|
.Ldgemm_kernel_L4_BEGIN:
|
||||||
|
@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
beq .Ldgemm_kernel_L4_END
|
beq .Ldgemm_kernel_L4_END
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
|
// We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
|
||||||
.Ldgemm_kernel_L4_Mv1_BEGIN:
|
.Ldgemm_kernel_L4_Mv1_BEGIN:
|
||||||
|
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM //SVE instruction
|
||||||
|
@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add origPB, origPB, temp // B = B + K * 4 * 8
|
add origPB, origPB, temp // B = B + K * 4 * 8
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 2 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L2_BEGIN:
|
.Ldgemm_kernel_L2_BEGIN:
|
||||||
|
@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
|
// We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
|
||||||
.Ldgemm_kernel_L2_Mv1_BEGIN:
|
.Ldgemm_kernel_L2_Mv1_BEGIN:
|
||||||
|
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM //SVE instruction
|
||||||
|
@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 1 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldgemm_kernel_L1_BEGIN:
|
.Ldgemm_kernel_L1_BEGIN:
|
||||||
|
@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
|
// We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
|
||||||
.Ldgemm_kernel_L1_Mv1_BEGIN:
|
.Ldgemm_kernel_L1_Mv1_BEGIN:
|
||||||
|
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM //SVE instruction
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <arm_sve.h>
|
#include <arm_sve.h>
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling
|
// TODO: write in assembly with proper unrolling of inner loop
|
||||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG j;
|
BLASLONG j;
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <arm_sve.h>
|
#include <arm_sve.h>
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling
|
// TODO: write in assembly with proper unrolling of inner loop
|
||||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG j;
|
BLASLONG j;
|
||||||
|
|
|
@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define alpha0 d10
|
#define alpha0 d10
|
||||||
#define alphaZ z2.d
|
#define alphaZ z2.d
|
||||||
|
|
||||||
#define A_PRE_SIZE 2560
|
#define A_PRE_SIZE 1536
|
||||||
#define B_PRE_SIZE 512
|
#define B_PRE_SIZE 512
|
||||||
#define C_PRE_SIZE 128
|
#define C_PRE_SIZE 128
|
||||||
|
|
||||||
|
@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNELv1x8_I
|
.macro KERNELv1x8_I
|
||||||
ld1d z0.d, p1/z, [pA]
|
ld1d z0.d, p1/z, [pA]
|
||||||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
||||||
//incb pA, all, mul #2
|
|
||||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||||
|
|
||||||
ld1rd z8.d, p0/z, [pB]
|
ld1rd z8.d, p0/z, [pB]
|
||||||
|
@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
mov pB, origPB
|
mov pB, origPB
|
||||||
|
// Loop over N
|
||||||
mov counterJ, origN
|
mov counterJ, origN
|
||||||
asr counterJ, counterJ, #3 // J = J / 8
|
asr counterJ, counterJ, #3 // J = J / 8
|
||||||
cmp counterJ, #0
|
cmp counterJ, #0
|
||||||
ble .Ldtrmm_kernel_L4_BEGIN
|
ble .Ldtrmm_kernel_L4_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
/* Repeat this as long as there are 8 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldtrmm_kernel_L8_BEGIN:
|
.Ldtrmm_kernel_L8_BEGIN:
|
||||||
|
@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.Ldtrmm_kernel_L8_Mv1_BEGIN:
|
.Ldtrmm_kernel_L8_Mv1_BEGIN:
|
||||||
|
|
||||||
|
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
|
||||||
mov counterI, #0
|
mov counterI, #0
|
||||||
whilelt p1.d, counterI, origM //SVE instruction
|
whilelt p1.d, counterI, origM
|
||||||
cntp lanes, p0, p1.d
|
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldtrmm_kernel_L8_Mv1_20:
|
.Ldtrmm_kernel_L8_Mv1_20:
|
||||||
|
@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
bgt .Ldtrmm_kernel_L8_BEGIN
|
bgt .Ldtrmm_kernel_L8_BEGIN
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 4 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldtrmm_kernel_L4_BEGIN:
|
.Ldtrmm_kernel_L4_BEGIN:
|
||||||
|
@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 2 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldtrmm_kernel_L2_BEGIN:
|
.Ldtrmm_kernel_L2_BEGIN:
|
||||||
|
@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
/******************************************************************************/
|
/* Repeat the same thing if 1 left in N */
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldtrmm_kernel_L1_BEGIN:
|
.Ldtrmm_kernel_L1_BEGIN:
|
||||||
|
|
|
@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
BLASLONG i, js;
|
BLASLONG i, js;
|
||||||
BLASLONG X;
|
BLASLONG X;
|
||||||
//printf("Using trmm_ln.\n");
|
|
||||||
|
|
||||||
int sve_len = svcntd();
|
int sve_len = svcntd();
|
||||||
svint64_t index = svindex_s64(0LL, lda);
|
svint64_t index = svindex_s64(0LL, lda);
|
||||||
|
@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
/* svbool_t pm = svwhilelt_b64(i, m); */
|
|
||||||
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
|
if (X > posY) {
|
||||||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||||
svst1(pn, b, aj_vec);
|
svst1(pn, b, aj_vec);
|
||||||
ao ++;
|
ao ++;
|
||||||
|
@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
X ++;
|
X ++;
|
||||||
i ++;
|
i ++;
|
||||||
} else {
|
} else {
|
||||||
|
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
int temp = 0;
|
int temp = 0;
|
||||||
for (int j = 0; j < n_active; j++) {
|
for (int j = 0; j < n_active; j++) {
|
||||||
|
@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
} while (i < m);
|
} while (i < m);
|
||||||
|
|
||||||
//printf("\n");
|
|
||||||
|
|
||||||
|
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64(js, n);
|
||||||
|
|
|
@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
BLASLONG i, js;
|
BLASLONG i, js;
|
||||||
BLASLONG X;
|
BLASLONG X;
|
||||||
|
|
||||||
//printf("Using trmm_lt.\n");
|
|
||||||
|
|
||||||
int sve_len = svcntd();
|
int sve_len = svcntd();
|
||||||
|
|
||||||
FLOAT *ao;
|
FLOAT *ao;
|
||||||
|
@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
/* svbool_t pm = svwhilelt_b64(i, m); */
|
|
||||||
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
|
if (X > posY) {
|
||||||
ao ++;
|
ao ++;
|
||||||
b += n_active;
|
b += n_active;
|
||||||
X ++;
|
X ++;
|
||||||
|
@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
X ++;
|
X ++;
|
||||||
i ++;
|
i ++;
|
||||||
} else {
|
} else {
|
||||||
|
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
int temp = 0;
|
int temp = 0;
|
||||||
for (int j = 0; j < n_active; j++) {
|
for (int j = 0; j < n_active; j++) {
|
||||||
|
@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
} while (i < m);
|
} while (i < m);
|
||||||
|
|
||||||
//printf("\n");
|
|
||||||
|
|
||||||
|
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
|
|
|
@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
BLASLONG i, js;
|
BLASLONG i, js;
|
||||||
BLASLONG X;
|
BLASLONG X;
|
||||||
//printf("Using trmm_un.\n");
|
|
||||||
//printf("Using m %ld, n %ld.\n", m, n);
|
|
||||||
//printf("Using lda %ld.\n", lda);
|
|
||||||
//printf("Using posX %ld, posY %ld.\n", posX, posY);
|
|
||||||
|
|
||||||
int sve_len = svcntd();
|
int sve_len = svcntd();
|
||||||
svint64_t index = svindex_s64(0LL, lda);
|
svint64_t index = svindex_s64(0LL, lda);
|
||||||
|
@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
/* svbool_t pm = svwhilelt_b64(i, m); */
|
|
||||||
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
|
if (X < posY) {
|
||||||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||||
svst1(pn, b, aj_vec);
|
svst1(pn, b, aj_vec);
|
||||||
ao ++;
|
ao ++;
|
||||||
|
@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
X ++;
|
X ++;
|
||||||
i ++;
|
i ++;
|
||||||
} else {
|
} else {
|
||||||
|
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
int temp = 0;
|
int temp = 0;
|
||||||
for (int j = 0; j < n_active; j++) {
|
for (int j = 0; j < n_active; j++) {
|
||||||
|
@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
} while (i < m);
|
} while (i < m);
|
||||||
|
|
||||||
//printf("\n");
|
|
||||||
|
|
||||||
|
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64(js, n);
|
||||||
|
|
|
@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
BLASLONG i, js;
|
BLASLONG i, js;
|
||||||
BLASLONG X;
|
BLASLONG X;
|
||||||
//printf("Using trmm_ut.\n");
|
|
||||||
|
|
||||||
int sve_len = svcntd();
|
int sve_len = svcntd();
|
||||||
|
|
||||||
|
@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
/* svbool_t pm = svwhilelt_b64(i, m); */
|
|
||||||
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
|
if (X < posY) {
|
||||||
ao ++;
|
ao ++;
|
||||||
b += n_active;
|
b += n_active;
|
||||||
X ++;
|
X ++;
|
||||||
|
@ -84,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
X ++;
|
X ++;
|
||||||
i ++;
|
i ++;
|
||||||
} else {
|
} else {
|
||||||
|
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
int temp = 0;
|
int temp = 0;
|
||||||
for (int j = 0; j < n_active; j++) {
|
for (int j = 0; j < n_active; j++) {
|
||||||
|
@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
} while (i < m);
|
} while (i < m);
|
||||||
|
|
||||||
//printf("\n");
|
|
||||||
|
|
||||||
|
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64(js, n);
|
||||||
|
|
Loading…
Reference in New Issue