add v2x8 kernel + fix sve dtrmm
This commit is contained in:
parent
7093372e32
commit
ab7917910d
|
@ -143,34 +143,22 @@ endif
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
|
|
||||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||||
|
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
else
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
|
||||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
|
||||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -344,21 +344,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow1, pCRow0, LDC
|
add pCRow1, pCRow0, LDC
|
||||||
fmla z16.d, p1/m, z16.d, alphaZ
|
fmul z16.d, p1/m, z16.d, alphaZ
|
||||||
st1d z16.d, p1, [pCRow0]
|
st1d z16.d, p1, [pCRow0]
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow2, pCRow1, LDC
|
add pCRow2, pCRow1, LDC
|
||||||
fmla z17.d, p1/m, z17.d, alphaZ
|
fmul z17.d, p1/m, z17.d, alphaZ
|
||||||
st1d z17.d, p1, [pCRow1]
|
st1d z17.d, p1, [pCRow1]
|
||||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow1, pCRow2, LDC
|
add pCRow1, pCRow2, LDC
|
||||||
fmla z18.d, p1/m, z18.d, alphaZ
|
fmul z18.d, p1/m, z18.d, alphaZ
|
||||||
st1d z18.d, p1, [pCRow2]
|
st1d z18.d, p1, [pCRow2]
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
|
||||||
fmla z19.d, p1/m, z19.d, alphaZ
|
fmul z19.d, p1/m, z19.d, alphaZ
|
||||||
st1d z19.d, p1, [pCRow1]
|
st1d z19.d, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||||
|
@ -392,11 +392,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
|
||||||
add pCRow1, pCRow0, LDC
|
add pCRow1, pCRow0, LDC
|
||||||
fmla z16.d, p1/m, z16.d, alphaZ
|
fmul z16.d, p1/m, z16.d, alphaZ
|
||||||
st1d z16.d, p1, [pCRow0]
|
st1d z16.d, p1, [pCRow0]
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
|
||||||
fmla z17.d, p1/m, z17.d, alphaZ
|
fmul z17.d, p1/m, z17.d, alphaZ
|
||||||
st1d z17.d, p1, [pCRow1]
|
st1d z17.d, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||||
|
@ -426,7 +426,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
|
||||||
fmla z16.d, p1/m, z16.d, alphaZ
|
fmul z16.d, p1/m, z16.d, alphaZ
|
||||||
st1d z16.d, p1, [pCRow0]
|
st1d z16.d, p1, [pCRow0]
|
||||||
|
|
||||||
|
|
||||||
|
|
4
param.h
4
param.h
|
@ -3328,8 +3328,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||||
|
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
Loading…
Reference in New Issue