Clean up k2 removal more and unroll SGEMM more

This commit is contained in:
Chris Sidebottom 2024-07-18 17:34:43 +00:00
parent b1c9fafabb
commit 9984c5ce9d
5 changed files with 555 additions and 129 deletions

View File

@ -80,25 +80,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k));
#define LOAD_A1(m, offset_k) \
float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k);
#define VECTOR_LOAD_B_K2(n, offset_k) \
float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k));
#define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \
float64x2_t b##n0##_k##offset_k0 = \
vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \
float64x2_t b##n0##_k##offset_k1 = \
vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0);
#define SCALE_B2_K2(n0, offset_k0, offset_k1) \
svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \
svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1);
#define GATHER_LOAD_B2(n, offset_k) \
float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \
b##n##_k##offset_k = \
vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1);
#define VECTOR_UNPACK_B2(n, offset_k) \
float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k));
#define VECTOR_PACK_B2(n, offset_k) \
vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k);
#define PACK_B0(n, offset_k) \
PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0);
#define UPDATE_RESULT_VECTOR2(m, n, offset_k) \
@ -128,9 +115,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k));
#define VECTOR_LOAD_A(pg, m, offset_k) \
svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k));
#define QUADWORD_LOAD_B(n, offset_k) \
svfloat64_t b##s##n##_k##offset_k = \
svld1rq(pg_true, &B_ELEMENT_K(n, offset_k));
#define GATHER_LOAD_A(pg, m, offset_k) \
svfloat64_t a##s##m##_k##offset_k = \
svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec);
@ -226,7 +210,6 @@ CNAME(BLASLONG M,
const BLASLONG v_m1 = M & -v_size;
const BLASLONG n4 = N & -4;
const BLASLONG n2 = N & -2;
const BLASLONG k2 = K & -2;
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
FLOAT* packed_a =
@ -266,6 +249,7 @@ CNAME(BLASLONG M,
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
@ -285,6 +269,7 @@ CNAME(BLASLONG M,
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@ -345,6 +330,7 @@ CNAME(BLASLONG M,
if (LIKELY(packed_a != NULL)) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@ -356,6 +342,7 @@ CNAME(BLASLONG M,
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);

View File

@ -237,6 +237,7 @@ CNAME(BLASLONG M,
#endif
{
const uint64_t v_size = svcntw();
const uint64_t v_size2 = v_size * 2;
const svbool_t pg_true = svptrue_b32();
const svbool_t pg_quad = svwhilelt_b32(0, 4);
const svbool_t pg_first = svwhilelt_b32(0, 1);
@ -245,10 +246,11 @@ CNAME(BLASLONG M,
const svfloat32_t beta_vec = svdup_f32(beta);
#endif
const BLASLONG n4 = N & -4;
const BLASLONG v_m2 = M & -v_size2;
const BLASLONG v_m1 = M & -v_size;
const BLASLONG k4 = K & -4;
const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0;
const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
FLOAT* packed_b =
(pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL;
@ -269,16 +271,21 @@ CNAME(BLASLONG M,
CREATE_B_POINTER(3, 3);
BLASLONG i = 0;
for (; i < v_m1; i += v_size) {
for (; i < v_m2; i += v_size2) {
CREATE_A_POINTER(0, 0);
UPDATE_A_POINTER(v_size);
CREATE_A_POINTER(1, v_size);
UPDATE_A_POINTER(v_size2);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
DECLARE_RESULT_VECTOR(1, 0);
DECLARE_RESULT_VECTOR(1, 1);
DECLARE_RESULT_VECTOR(1, 2);
DECLARE_RESULT_VECTOR(1, 3);
if (LIKELY(packed_b != NULL)) {
if (i == 0) {
@ -314,6 +321,26 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1);
VECTOR_LOAD_A(pg_true, 1, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2);
VECTOR_LOAD_A(pg_true, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3);
}
for (; k < K; k++) {
@ -324,12 +351,17 @@ CNAME(BLASLONG M,
BROADCAST_LOAD_B(1, 0);
PACK_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
BROADCAST_LOAD_B(2, 0);
PACK_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
BROADCAST_LOAD_B(3, 0);
PACK_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
} else {
for (; k < K; k++) {
@ -340,11 +372,118 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
}
} else {
for (; k < k4; k += 4) {
VECTOR_LOAD_B_K4(0, 0);
VECTOR_LOAD_B_K4(1, 0);
VECTOR_LOAD_B_K4(2, 0);
VECTOR_LOAD_B_K4(3, 0);
TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3);
SCALE_B4_K4(0, 0, 1, 2, 3);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1);
VECTOR_LOAD_A(pg_true, 0, 2);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2);
VECTOR_LOAD_A(pg_true, 0, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1);
VECTOR_LOAD_A(pg_true, 1, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2);
VECTOR_LOAD_A(pg_true, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
VECTOR_STORE(pg_true, 0, 3);
VECTOR_STORE(pg_true, 1, 0);
VECTOR_STORE(pg_true, 1, 1);
VECTOR_STORE(pg_true, 1, 2);
VECTOR_STORE(pg_true, 1, 3);
INCR_C_POINTER(0, v_size2);
INCR_C_POINTER(1, v_size2);
INCR_C_POINTER(2, v_size2);
INCR_C_POINTER(3, v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_A_POINTER(0, 0);
UPDATE_A_POINTER(v_size);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
if (LIKELY(packed_b != NULL)) {
for (; k < K; k++) {
UNPACK_QUADWORD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
}
} else {
for (; k < k4; k += 4) {
VECTOR_LOAD_B_K4(0, 0);
VECTOR_LOAD_B_K4(1, 0);
VECTOR_LOAD_B_K4(2, 0);
@ -478,6 +617,28 @@ CNAME(BLASLONG M,
CREATE_B_POINTER(0, 0);
BLASLONG i = 0;
for (; i < v_m2; i += v_size2) {
CREATE_A_POINTER(0, 0);
CREATE_A_POINTER(1, v_size);
UPDATE_A_POINTER(v_size2);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(1, 0);
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 1, 0);
INCR_C_POINTER(0, v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_A_POINTER(0, 0);

View File

@ -209,6 +209,7 @@ CNAME(BLASLONG M,
#endif
{
const uint64_t v_size = svcntw();
const uint64_t v_size2 = v_size * 2;
const svbool_t pg_true = svptrue_b32();
const svbool_t pg_quad = svwhilelt_b32(0, 4);
const svbool_t pg_first = svwhilelt_b32(0, 1);
@ -217,9 +218,10 @@ CNAME(BLASLONG M,
const svfloat32_t beta_vec = svdup_f32(beta);
#endif
const BLASLONG n4 = N & -4;
const BLASLONG v_m2 = M & -v_size2;
const BLASLONG v_m1 = M & -v_size;
const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0;
const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
FLOAT* packed_b =
(pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL;
@ -240,16 +242,21 @@ CNAME(BLASLONG M,
CREATE_B_POINTER(3, 3);
BLASLONG i = 0;
for (; i < v_m1; i += v_size) {
for (; i < v_m2; i += v_size2) {
CREATE_A_POINTER(0, 0);
UPDATE_A_POINTER(v_size);
CREATE_A_POINTER(1, v_size);
UPDATE_A_POINTER(v_size2);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
DECLARE_RESULT_VECTOR(1, 0);
DECLARE_RESULT_VECTOR(1, 1);
DECLARE_RESULT_VECTOR(1, 2);
DECLARE_RESULT_VECTOR(1, 3);
if (LIKELY(packed_b != NULL)) {
if (i == 0) {
@ -262,6 +269,11 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
} else {
for (; k < K; k++) {
@ -272,11 +284,66 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
VECTOR_STORE(pg_true, 0, 3);
VECTOR_STORE(pg_true, 1, 0);
VECTOR_STORE(pg_true, 1, 1);
VECTOR_STORE(pg_true, 1, 2);
VECTOR_STORE(pg_true, 1, 3);
INCR_C_POINTER(0, v_size2);
INCR_C_POINTER(1, v_size2);
INCR_C_POINTER(2, v_size2);
INCR_C_POINTER(3, v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_A_POINTER(0, 0);
UPDATE_A_POINTER(v_size);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
if (LIKELY(packed_b != NULL)) {
for (; k < K; k++) {
UNPACK_QUADWORD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
@ -346,6 +413,28 @@ CNAME(BLASLONG M,
CREATE_B_POINTER(0, 0);
BLASLONG i = 0;
for (; i < v_m2; i += v_size2) {
CREATE_A_POINTER(0, 0);
CREATE_A_POINTER(1, v_size);
UPDATE_A_POINTER(v_size2);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(1, 0);
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
VECTOR_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
VECTOR_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 1, 0);
INCR_C_POINTER(0, v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_A_POINTER(0, 0);

View File

@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// #undef C_ELEMENT
// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc]
#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m]
#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m]
#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0)
// ASIMD
@ -206,6 +206,7 @@ CNAME(BLASLONG M,
#endif
{
const uint64_t v_size = svcntw();
const uint64_t v_size2 = v_size * 2;
const svbool_t pg_true = svptrue_b32();
const svbool_t pg_quad = svwhilelt_b32(0, 4);
const svbool_t pg_first = svwhilelt_b32(0, 1);
@ -215,18 +216,153 @@ CNAME(BLASLONG M,
#endif
const svuint32_t lda_vec = svindex_u32(0LL, lda);
const BLASLONG v_m2 = M & -v_size2;
const BLASLONG v_m1 = M & -v_size;
const BLASLONG n4 = N & -4;
const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0;
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
FLOAT* packed_a =
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL;
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL;
FLOAT* a_offset = A;
FLOAT* b_offset = B;
FLOAT* c_offset = C;
BLASLONG i = 0;
for (; i < v_m2; i += v_size2) {
CREATE_C_POINTER(0, 0);
CREATE_C_POINTER(1, v_size);
CREATE_A_POINTER(0, 0);
CREATE_A_POINTER(1, v_size);
BLASLONG j = 0;
for (; j < n4; j += 4) {
CREATE_B_POINTER(0, 0);
CREATE_B_POINTER(1, 1);
CREATE_B_POINTER(2, 2);
CREATE_B_POINTER(3, 3);
UPDATE_B_POINTER(4);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
DECLARE_RESULT_VECTOR(1, 0);
DECLARE_RESULT_VECTOR(1, 1);
DECLARE_RESULT_VECTOR(1, 2);
DECLARE_RESULT_VECTOR(1, 3);
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
GATHER_LOAD_A(pg_true, 1, 0);
VECTOR_PACK_A(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
VECTOR_STORE(pg_true, 0, 3);
VECTOR_STORE(pg_true, 1, 0);
VECTOR_STORE(pg_true, 1, 1);
VECTOR_STORE(pg_true, 1, 2);
VECTOR_STORE(pg_true, 1, 3);
INCR_C_POINTER(0, 4);
INCR_C_POINTER(1, 4);
}
for (; j < N; j++) {
CREATE_B_POINTER(0, 0);
UPDATE_B_POINTER(1);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(1, 0);
if (LIKELY(packed_a != NULL)) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 1, 0);
INCR_C_POINTER(0, 1);
INCR_C_POINTER(1, 1);
}
UPDATE_A_POINTER(v_size2);
RESET_B_POINTER();
UPDATE_C_POINTER(v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_C_POINTER(0, 0);
@ -247,36 +383,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
BROADCAST_LOAD_B(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
BROADCAST_LOAD_B(2, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
}
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -289,7 +395,6 @@ CNAME(BLASLONG M,
BROADCAST_LOAD_B(3, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
@ -304,21 +409,12 @@ CNAME(BLASLONG M,
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
if (LIKELY(packed_a != NULL)) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
INCR_C_POINTER(0, 1);
}

View File

@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// #undef C_ELEMENT
// #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc]
#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m]
#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m]
#define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0)
// ASIMD
@ -207,6 +207,7 @@ CNAME(BLASLONG M,
#endif
{
const uint64_t v_size = svcntw();
const uint64_t v_size2 = v_size * 2;
const svbool_t pg_true = svptrue_b32();
const svbool_t pg_quad = svwhilelt_b32(0, 4);
const svbool_t pg_first = svwhilelt_b32(0, 1);
@ -216,18 +217,144 @@ CNAME(BLASLONG M,
#endif
const svuint32_t lda_vec = svindex_u32(0LL, lda);
const BLASLONG v_m2 = M & -v_size2;
const BLASLONG v_m1 = M & -v_size;
const BLASLONG n4 = N & -4;
const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0;
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
FLOAT* packed_a =
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL;
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL;
FLOAT* a_offset = A;
FLOAT* b_offset = B;
FLOAT* c_offset = C;
BLASLONG i = 0;
for (; i < v_m2; i += v_size2) {
CREATE_C_POINTER(0, 0);
CREATE_C_POINTER(1, v_size);
CREATE_A_POINTER(0, 0);
CREATE_A_POINTER(1, v_size);
BLASLONG j = 0;
for (; j < n4; j += 4) {
CREATE_B_POINTER(0, 0);
CREATE_B_POINTER(1, 1);
CREATE_B_POINTER(2, 2);
CREATE_B_POINTER(3, 3);
UPDATE_B_POINTER(4);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
DECLARE_RESULT_VECTOR(1, 0);
DECLARE_RESULT_VECTOR(1, 1);
DECLARE_RESULT_VECTOR(1, 2);
DECLARE_RESULT_VECTOR(1, 3);
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
GATHER_LOAD_A(pg_true, 1, 0);
VECTOR_PACK_A(1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
VECTOR_STORE(pg_true, 0, 3);
VECTOR_STORE(pg_true, 1, 0);
VECTOR_STORE(pg_true, 1, 1);
VECTOR_STORE(pg_true, 1, 2);
VECTOR_STORE(pg_true, 1, 3);
INCR_C_POINTER(0, 4);
INCR_C_POINTER(1, 4);
}
for (; j < N; j++) {
CREATE_B_POINTER(0, 0);
UPDATE_B_POINTER(1);
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(1, 0);
if (LIKELY(packed_a != NULL)) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 1, 0);
INCR_C_POINTER(0, 1);
INCR_C_POINTER(1, 1);
}
UPDATE_A_POINTER(v_size2);
RESET_B_POINTER();
UPDATE_C_POINTER(v_size2);
}
for (; i < v_m1; i += v_size) {
CREATE_C_POINTER(0, 0);
@ -248,30 +375,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
}
}
} else {
for (; k < K; k++) {
QUADWORD_LOAD_B(0, 0);
@ -281,7 +384,6 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
VECTOR_STORE(pg_true, 0, 1);
VECTOR_STORE(pg_true, 0, 2);
@ -296,21 +398,12 @@ CNAME(BLASLONG M,
BLASLONG k = 0;
DECLARE_RESULT_VECTOR(0, 0);
if (LIKELY(packed_a != NULL)) {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
}
} else {
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
}
}
VECTOR_STORE(pg_true, 0, 0);
INCR_C_POINTER(0, 1);
}