Remove k2 loop from DGEMM TN and use a more conservative heuristic for SGEMM

This commit is contained in:
Chris Sidebottom 2024-07-18 17:37:18 +01:00
parent 8c472ef7e3
commit b1c9fafabb
2 changed files with 2 additions and 211 deletions

View File

@ -265,43 +265,7 @@ CNAME(BLASLONG M,
if (LIKELY(packed_a != NULL)) {
if (j == 0) {
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 0, 1);
VECTOR_PACK_A(0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
VECTOR_LOAD_B_K2(2, 0);
VECTOR_LOAD_B_K2(3, 0);
TRANSPOSE_B2_K2(2, 3, 0, 1);
SCALE_B2_K2(2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1);
GATHER_LOAD_A(pg_true, 1, 0);
VECTOR_PACK_A(1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0);
GATHER_LOAD_A(pg_true, 1, 1);
VECTOR_PACK_A(1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
VECTOR_PACK_A(0, 0);
@ -320,39 +284,7 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
}
} else {
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UNPACK_VECTOR_A(0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
VECTOR_LOAD_B_K2(2, 0);
VECTOR_LOAD_B_K2(3, 0);
TRANSPOSE_B2_K2(2, 3, 0, 1);
SCALE_B2_K2(2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0);
UNPACK_VECTOR_A(1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@ -370,37 +302,6 @@ CNAME(BLASLONG M,
}
}
} else {
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
VECTOR_LOAD_B_K2(2, 0);
VECTOR_LOAD_B_K2(3, 0);
TRANSPOSE_B2_K2(2, 3, 0, 1);
SCALE_B2_K2(2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0);
GATHER_LOAD_A(pg_true, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -443,27 +344,7 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(1, 1);
if (LIKELY(packed_a != NULL)) {
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
UNPACK_VECTOR_A(0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
UNPACK_VECTOR_A(1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
UNPACK_VECTOR_A(1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
UNPACK_VECTOR_A(0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@ -474,27 +355,7 @@ CNAME(BLASLONG M,
UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
}
} else {
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
GATHER_LOAD_A(pg_true, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 1, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@ -570,27 +431,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
VECTOR_LOAD_B_K2(2, 0);
VECTOR_LOAD_B_K2(3, 0);
TRANSPOSE_B2_K2(2, 3, 0, 1);
SCALE_B2_K2(2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -619,19 +459,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_true, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_true, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -686,27 +513,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 2);
DECLARE_RESULT_VECTOR(0, 3);
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_tail, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_tail, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
VECTOR_LOAD_B_K2(2, 0);
VECTOR_LOAD_B_K2(3, 0);
TRANSPOSE_B2_K2(2, 3, 0, 1);
SCALE_B2_K2(2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -735,19 +541,6 @@ CNAME(BLASLONG M,
DECLARE_RESULT_VECTOR(0, 0);
DECLARE_RESULT_VECTOR(0, 1);
for (; k < k2; k += 2) {
VECTOR_LOAD_B_K2(0, 0);
VECTOR_LOAD_B_K2(1, 0);
TRANSPOSE_B2_K2(0, 1, 0, 1);
SCALE_B2_K2(0, 0, 1);
GATHER_LOAD_A(pg_tail, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0);
GATHER_LOAD_A(pg_tail, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1);
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1);
}
for (; k < K; k++) {
BROADCAST_LOAD_B(0, 0);
@ -787,4 +580,4 @@ CNAME(BLASLONG M,
free(packed_a);
return 0;
}
}

View File

@ -35,11 +35,9 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
if (MNK <= 64*64*64)
return 1;
#else // sgemm
- if (MNK <= 256*256*256)
+ if (MNK <= 64*64*64)
return 1;
#endif
return 0;
}