sbgemm: spr: only load A once in tail_k handling

This commit is contained in:
Wangyang Guo 2021-09-16 01:04:01 -07:00
parent 9ab33228bb
commit f2485352a6
1 changed file with 34 additions and 28 deletions

View File

@@ -317,17 +317,19 @@ tail_k:
n_count = n;
lda = remain_k2;
ldb = 32;
TCONF_TAIL(cfg, tail_m, 16, remain_k2);
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k32;
ptr_b1 = ptr_b + 16 * k2;
LOAD_C(0, 0);
if (n_count > 15) {
TCONF_TAIL(cfg, tail_m, 16, remain_k2);
LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x);
LOAD_B(x, 0); LOAD_B_TAIL(x, 1);
MATMUL(0, 0); MATMUL_TAIL(1, 1);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k32;
ptr_b1 = ptr_b + 16 * k2;
LOAD_C(0, 0);
LOAD_B(x, 0); LOAD_B_TAIL(x, 1);
MATMUL(0, 0); MATMUL_TAIL(1, 1);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
}
}
if (n_count > 0) {
int tail_n = (n_count > 16) ? 16: n_count;
@@ -356,16 +358,18 @@ tail_k:
n_count = n;
lda = remain_k2;
ldb = 32;
TCONF(cfg, tail_m, 16, remain_k2);
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k32;
LOAD_C(0, 0);
if (n_count > 15) {
TCONF(cfg, tail_m, 16, remain_k2);
LOAD_A(0, x);
LOAD_B(x, 0);
MATMUL(0, 0);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k32;
LOAD_C(0, 0);
LOAD_B(x, 0);
MATMUL(0, 0);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
}
}
if (n_count > 0) {
int tail_n = (n_count > 16) ? 16: n_count;
@@ -390,16 +394,18 @@ tail_k:
ptr_c00 = ptr_c;
ptr_c += tail_m * ldc;
n_count = n;
TCONF(cfg, tail_m, 16, 2);
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k2;
LOAD_C(0, 0);
if (n_count > 15) {
TCONF(cfg, tail_m, 16, 2);
MASK_LOAD_A_TAIL(0, x);
LOAD_B_TAIL(x, 0);
MATMUL(0, 0);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
for (; n_count > 15; n_count -= 16) {
ptr_b0 = ptr_b + 16 * k2;
LOAD_C(0, 0);
LOAD_B_TAIL(x, 0);
MATMUL(0, 0);
STORE_C(0, 0);
ptr_b += 16 * k;
ptr_c00 += 16;
}
}
if (n_count > 0) {
int tail_n = (n_count > 16) ? 16: n_count;