fix zgemm kernel
This commit is contained in:
parent
6ec4aab875
commit
40b14e4957
|
@ -53,12 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define alphaR x19
|
#define alphaR x19
|
||||||
#define alphaI x20
|
#define alphaI x20
|
||||||
|
|
||||||
#define alphaz_R z10.d
|
#define alphaz_R z6.d
|
||||||
#define alphaz_I z11.d
|
#define alphaz_I z7.d
|
||||||
#define alpha0_R d10
|
#define alpha0_R d6
|
||||||
#define alphaV0_R v10.d[0]
|
#define alpha0_I d7
|
||||||
#define alpha0_I d11
|
|
||||||
#define alphaV0_I v11.d[0]
|
|
||||||
|
|
||||||
|
|
||||||
#define A_PRE_SIZE 2560
|
#define A_PRE_SIZE 2560
|
||||||
|
@ -170,8 +168,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro KERNELv1x4_I
|
.macro KERNELv1x4_I
|
||||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||||
ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one
|
add pA, pA, lanes, lsl #4 // pA += lanes*2*8
|
||||||
add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8
|
ld2d {z2.d, z3.d}, p1/z, [pA] // next one
|
||||||
|
add pA, pA, lanes, lsl #4 // pA += lanes*2*8
|
||||||
|
|
||||||
ld1rd z8.d, p0/z, [pB]
|
ld1rd z8.d, p0/z, [pB]
|
||||||
ld1rd z9.d, p0/z, [pB, 8]
|
ld1rd z9.d, p0/z, [pB, 8]
|
||||||
|
@ -283,7 +282,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNELv1x4_M2
|
.macro KERNELv1x4_M2
|
||||||
ld2d {z2.d, z3.d}, p1/z, [pA]
|
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8
|
add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8
|
||||||
|
|
||||||
OP_rr z16.d, p1/m, z2.d, z8.d
|
OP_rr z16.d, p1/m, z2.d, z8.d
|
||||||
|
@ -396,39 +395,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||||
st2d {z25.d, z26.d}, p1, [pCRow0]
|
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow0, pCRow0, lanes, lsl #4
|
||||||
|
|
||||||
ld2d {z26.d, z27.d}, p1/z, [pCRow0]
|
ld2d {z26.d, z27.d}, p1/z, [pCRow1]
|
||||||
fmla z26.d, p1/m, z18.d, alphaz_R
|
fmla z26.d, p1/m, z18.d, alphaz_R
|
||||||
fmls z26.d, p1/m, z19.d, alphaz_I
|
fmls z26.d, p1/m, z19.d, alphaz_I
|
||||||
fmla z27.d, p1/m, z18.d, alphaz_I
|
fmla z27.d, p1/m, z18.d, alphaz_I
|
||||||
fmla z27.d, p1/m, z19.d, alphaz_R
|
fmla z27.d, p1/m, z19.d, alphaz_R
|
||||||
st2d {z26.d, z27.d}, p1, [pCRow0]
|
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow1, pCRow1, lanes, lsl #4
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
|
||||||
ld2d {z28.d, z29.d}, p1/z, [pCRow1]
|
ld2d {z28.d, z29.d}, p1/z, [pCRow2]
|
||||||
fmla z28.d, p1/m, z20.d, alphaz_R
|
fmla z28.d, p1/m, z20.d, alphaz_R
|
||||||
fmls z28.d, p1/m, z21.d, alphaz_I
|
fmls z28.d, p1/m, z21.d, alphaz_I
|
||||||
fmla z29.d, p1/m, z20.d, alphaz_I
|
fmla z29.d, p1/m, z20.d, alphaz_I
|
||||||
fmla z29.d, p1/m, z21.d, alphaz_R
|
fmla z29.d, p1/m, z21.d, alphaz_R
|
||||||
st2d {z28.d, z29.d}, p1, [pCRow1]
|
st2d {z28.d, z29.d}, p1, [pCRow2]
|
||||||
|
|
||||||
add pCRow1, pCRow1, #32
|
add pCRow2, pCRow2, lanes, lsl #4
|
||||||
|
|
||||||
ld2d {z30.d, z31.d}, p1/z, [pCRow1]
|
ld2d {z30.d, z31.d}, p1/z, [pCRow3]
|
||||||
fmla z30.d, p1/m, z22.d, alphaz_R
|
fmla z30.d, p1/m, z22.d, alphaz_R
|
||||||
fmls z30.d, p1/m, z23.d, alphaz_I
|
fmls z30.d, p1/m, z23.d, alphaz_I
|
||||||
fmla z31.d, p1/m, z22.d, alphaz_I
|
fmla z31.d, p1/m, z22.d, alphaz_I
|
||||||
fmla z31.d, p1/m, z23.d, alphaz_R
|
fmla z31.d, p1/m, z23.d, alphaz_R
|
||||||
st2d {z30.d, z31.d}, p1, [pCRow1]
|
st2d {z30.d, z31.d}, p1, [pCRow3]
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||||
|
|
||||||
|
@ -474,24 +473,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||||
st2d {z25.d, z26.d}, p1, [pCRow0]
|
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow0, pCRow0, lanes, lsl #4
|
||||||
|
|
||||||
ld2d {z26.d, z27.d}, p1/z, [pCRow0]
|
ld2d {z26.d, z27.d}, p1/z, [pCRow1]
|
||||||
fmla z26.d, p1/m, z18.d, alphaz_R
|
fmla z26.d, p1/m, z18.d, alphaz_R
|
||||||
fmls z26.d, p1/m, z19.d, alphaz_I
|
fmls z26.d, p1/m, z19.d, alphaz_I
|
||||||
fmla z27.d, p1/m, z18.d, alphaz_I
|
fmla z27.d, p1/m, z18.d, alphaz_I
|
||||||
fmla z27.d, p1/m, z19.d, alphaz_R
|
fmla z27.d, p1/m, z19.d, alphaz_R
|
||||||
st2d {z26.d, z27.d}, p1, [pCRow0]
|
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow1, pCRow1, lanes, lsl #4
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -526,10 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||||
st2d {z25.d, z26.d}, p1, [pCRow0]
|
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
|
||||||
|
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||||
|
|
||||||
|
@ -718,6 +712,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ble .Lzgemm_kernel_L1_BEGIN
|
ble .Lzgemm_kernel_L1_BEGIN
|
||||||
|
|
||||||
mov pCRow0, pC // pCRow0 = pC
|
mov pCRow0, pC // pCRow0 = pC
|
||||||
|
add pCRow1, pCRow0, LDC
|
||||||
|
|
||||||
add pC,pC,LDC, lsl #1
|
add pC,pC,LDC, lsl #1
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec);
|
svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec);
|
||||||
svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag));
|
svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag));
|
||||||
aoffset1 += 2;
|
aoffset1 += 2;
|
||||||
boffset += active;
|
boffset += active * 2;
|
||||||
}
|
}
|
||||||
aoffset += sve_size * lda * 2;
|
aoffset += sve_size * lda * 2;
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
aoffset1 += lda * 2;
|
aoffset1 += lda * 2;
|
||||||
boffset += active * 2;
|
boffset += active * 2;
|
||||||
}
|
}
|
||||||
aoffset += sve_size * 2;
|
aoffset += active * 2;
|
||||||
|
|
||||||
j += svcntd();
|
j += svcntd();
|
||||||
pg = svwhilelt_b64(j, n);
|
pg = svwhilelt_b64(j, n);
|
||||||
|
|
Loading…
Reference in New Issue