sbgemm: spr: optimization for tmp_c buffer
This commit is contained in:
parent
f018aa342a
commit
6bc8204ce5
|
@ -170,11 +170,20 @@ int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL
|
||||||
BLASLONG n_count, k_count;
|
BLASLONG n_count, k_count;
|
||||||
|
|
||||||
#ifndef ALPHA_ONE
|
#ifndef ALPHA_ONE
|
||||||
FLOAT *tmp_c = malloc(sizeof(FLOAT) * m * n);
|
// make sure each row is 64-byte aligned
|
||||||
memset(tmp_c, 0, sizeof(FLOAT) * m * n);
|
BLASLONG cn = (n & 31) ? (n & ~31) + 32 : n;
|
||||||
|
FLOAT *raw_tmp_c;
|
||||||
|
if (k < 32) {
|
||||||
|
// the buffer only needs to be zeroed in this case (k < 32)
|
||||||
|
raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64);
|
||||||
|
} else {
|
||||||
|
raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64);
|
||||||
|
}
|
||||||
|
// align the buffer to a 64-byte boundary
|
||||||
|
FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63);
|
||||||
ptr_c = tmp_c;
|
ptr_c = tmp_c;
|
||||||
BLASLONG ldc_o = ldc;
|
BLASLONG ldc_o = ldc;
|
||||||
ldc = n;
|
ldc = cn;
|
||||||
#endif
|
#endif
|
||||||
IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64)));
|
IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64)));
|
||||||
IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64)));
|
IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64)));
|
||||||
|
@ -515,7 +524,7 @@ int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL
|
||||||
MASK_APLPHA_STORE(0);
|
MASK_APLPHA_STORE(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(tmp_c);
|
free(raw_tmp_c);
|
||||||
#endif
|
#endif
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue