diff --git a/kernel/arm64/dgemm_small_kernel_tn_sve.c b/kernel/arm64/dgemm_small_kernel_tn_sve.c index 6d3f4dd28..1b0fada2a 100644 --- a/kernel/arm64/dgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c @@ -80,25 +80,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); #define LOAD_A1(m, offset_k) \ float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); -#define VECTOR_LOAD_B_K2(n, offset_k) \ - float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); -#define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \ - float64x2_t b##n0##_k##offset_k0 = \ - vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ - float64x2_t b##n0##_k##offset_k1 = \ - vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); - -#define SCALE_B2_K2(n0, offset_k0, offset_k1) \ - svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \ - svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1); #define GATHER_LOAD_B2(n, offset_k) \ float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ b##n##_k##offset_k = \ vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); #define VECTOR_UNPACK_B2(n, offset_k) \ float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); -#define VECTOR_PACK_B2(n, offset_k) \ - vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); #define PACK_B0(n, offset_k) \ PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ @@ -128,9 +115,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); #define VECTOR_LOAD_A(pg, m, offset_k) \ svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); -#define QUADWORD_LOAD_B(n, offset_k) \ - svfloat64_t b##s##n##_k##offset_k = \ - svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); #define GATHER_LOAD_A(pg, m, offset_k) \ svfloat64_t a##s##m##_k##offset_k = \ svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); @@ -226,7 +210,6 @@ CNAME(BLASLONG M, const BLASLONG v_m1 = M & -v_size; const BLASLONG n4 = N & -4; const BLASLONG n2 = N & -2; - const BLASLONG k2 = K & -2; const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; FLOAT* packed_a = @@ -266,6 +249,7 @@ CNAME(BLASLONG M, if (LIKELY(packed_a != NULL)) { if (j == 0) { for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); GATHER_LOAD_A(pg_true, 0, 0); VECTOR_PACK_A(0, 0); @@ -285,6 +269,7 @@ CNAME(BLASLONG M, } } else { for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); UNPACK_VECTOR_A(0, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); @@ -345,6 +330,7 @@ CNAME(BLASLONG M, if (LIKELY(packed_a != NULL)) { for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); UNPACK_VECTOR_A(0, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); @@ -356,6 +342,7 @@ CNAME(BLASLONG M, } } else { for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); GATHER_LOAD_A(pg_true, 0, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); @@ -580,4 +567,4 @@ CNAME(BLASLONG M, free(packed_a); return 0; -} +} \ No newline at end of file diff --git a/kernel/arm64/sgemm_small_kernel_nn_sve.c b/kernel/arm64/sgemm_small_kernel_nn_sve.c index 2e65e61ff..0af073a14 100644 --- a/kernel/arm64/sgemm_small_kernel_nn_sve.c +++ b/kernel/arm64/sgemm_small_kernel_nn_sve.c @@ -237,6 +237,7 @@ CNAME(BLASLONG M, #endif { const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; const svbool_t pg_true = svptrue_b32(); const svbool_t pg_quad = svwhilelt_b32(0, 4); const svbool_t pg_first = svwhilelt_b32(0, 1); @@ -245,10 +246,11 @@ CNAME(BLASLONG M, const svfloat32_t beta_vec = svdup_f32(beta); #endif const BLASLONG n4 = N & -4; + const BLASLONG v_m2 = M & -v_size2; const BLASLONG v_m1 = M & -v_size; const BLASLONG k4 = K & -4; - const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; + const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; FLOAT* packed_b = (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; @@ -269,16 +271,21 @@ CNAME(BLASLONG M, CREATE_B_POINTER(3, 3); BLASLONG i = 0; - for (; i < v_m1; i += v_size) { + for (; i < v_m2; i += v_size2) { CREATE_A_POINTER(0, 0); - UPDATE_A_POINTER(v_size); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); BLASLONG k = 0; DECLARE_RESULT_VECTOR(0, 0); DECLARE_RESULT_VECTOR(0, 1); DECLARE_RESULT_VECTOR(0, 2); DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); if (LIKELY(packed_b != NULL)) { if (i == 0) { @@ -314,6 +321,26 @@ CNAME(BLASLONG M, UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); } for (; k < K; k++) { @@ -324,12 +351,17 @@ CNAME(BLASLONG M, BROADCAST_LOAD_B(1, 0); PACK_B(1, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); BROADCAST_LOAD_B(2, 0); PACK_B(2, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); BROADCAST_LOAD_B(3, 0); PACK_B(3, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); } } else { for (; k < K; k++) { @@ -340,11 +372,118 @@ CNAME(BLASLONG M, UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); } } } else { for (; k < k4; k += 4) { + VECTOR_LOAD_B_K4(0, 0); + VECTOR_LOAD_B_K4(1, 0); + VECTOR_LOAD_B_K4(2, 0); + VECTOR_LOAD_B_K4(3, 0); + TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); + SCALE_B4_K4(0, 0, 1, 2, 3); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); + VECTOR_LOAD_A(pg_true, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); + VECTOR_LOAD_A(pg_true, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); + } + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < k4; k += 4) { + VECTOR_LOAD_B_K4(0, 0); VECTOR_LOAD_B_K4(1, 0); VECTOR_LOAD_B_K4(2, 0); @@ -478,6 +617,28 @@ CNAME(BLASLONG M, CREATE_B_POINTER(0, 0); BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, v_size2); + } for (; i < v_m1; i += v_size) { CREATE_A_POINTER(0, 0); diff --git a/kernel/arm64/sgemm_small_kernel_nt_sve.c b/kernel/arm64/sgemm_small_kernel_nt_sve.c index 9f99c2422..ed7ee6bd6 100644 --- a/kernel/arm64/sgemm_small_kernel_nt_sve.c +++ b/kernel/arm64/sgemm_small_kernel_nt_sve.c @@ -209,6 +209,7 @@ CNAME(BLASLONG M, #endif { const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; const svbool_t pg_true = svptrue_b32(); const svbool_t pg_quad = svwhilelt_b32(0, 4); const svbool_t pg_first = svwhilelt_b32(0, 1); @@ -217,9 +218,10 @@ CNAME(BLASLONG M, const svfloat32_t beta_vec = svdup_f32(beta); #endif const BLASLONG n4 = N & -4; + const BLASLONG v_m2 = M & -v_size2; const BLASLONG v_m1 = M & -v_size; - const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; + const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; FLOAT* packed_b = (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; @@ -240,16 +242,21 @@ CNAME(BLASLONG M, CREATE_B_POINTER(3, 3); BLASLONG i = 0; - for (; i < v_m1; i += v_size) { + for (; i < v_m2; i += v_size2) { CREATE_A_POINTER(0, 0); - UPDATE_A_POINTER(v_size); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); BLASLONG k = 0; DECLARE_RESULT_VECTOR(0, 0); DECLARE_RESULT_VECTOR(0, 1); DECLARE_RESULT_VECTOR(0, 2); DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); if (LIKELY(packed_b != NULL)) { if (i == 0) { @@ -262,6 +269,11 @@ CNAME(BLASLONG M, UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); } } else { for (; k < K; k++) { @@ -272,11 +284,66 @@ CNAME(BLASLONG M, UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); } } } else { for (; k < K; k++) { + QUADWORD_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, v_size2); + INCR_C_POINTER(1, v_size2); + INCR_C_POINTER(2, v_size2); + INCR_C_POINTER(3, v_size2); + } + for (; i < v_m1; i += v_size) { + + CREATE_A_POINTER(0, 0); + UPDATE_A_POINTER(v_size); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + + if (LIKELY(packed_b != NULL)) { + for (; k < K; k++) { + + UNPACK_QUADWORD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + QUADWORD_LOAD_B(0, 0); VECTOR_LOAD_A(pg_true, 0, 0); UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); @@ -346,6 +413,28 @@ CNAME(BLASLONG M, CREATE_B_POINTER(0, 0); BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + UPDATE_A_POINTER(v_size2); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + VECTOR_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + VECTOR_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, v_size2); + } for (; i < v_m1; i += v_size) { CREATE_A_POINTER(0, 0); diff --git a/kernel/arm64/sgemm_small_kernel_tn_sve.c b/kernel/arm64/sgemm_small_kernel_tn_sve.c index 9cbb60d40..54608a47b 100644 --- a/kernel/arm64/sgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tn_sve.c @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #undef C_ELEMENT // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] -#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) // ASIMD @@ -206,6 +206,7 @@ CNAME(BLASLONG M, #endif { const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; const svbool_t pg_true = svptrue_b32(); const svbool_t pg_quad = svwhilelt_b32(0, 4); const svbool_t pg_first = svwhilelt_b32(0, 1); @@ -215,18 +216,153 @@ CNAME(BLASLONG M, #endif const svuint32_t lda_vec = svindex_u32(0LL, lda); + const BLASLONG v_m2 = M & -v_size2; const BLASLONG v_m1 = M & -v_size; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; FLOAT* packed_a = - (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; + (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; FLOAT* a_offset = A; FLOAT* b_offset = B; FLOAT* c_offset = C; BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } for (; i < v_m1; i += v_size) { CREATE_C_POINTER(0, 0); @@ -247,48 +383,17 @@ CNAME(BLASLONG M, DECLARE_RESULT_VECTOR(0, 2); DECLARE_RESULT_VECTOR(0, 3); - if (LIKELY(packed_a != NULL)) { - if (j == 0) { - for (; k < K; k++) { + for (; k < K; k++) { - BROADCAST_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - VECTOR_PACK_A(0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - BROADCAST_LOAD_B(1, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); - BROADCAST_LOAD_B(2, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); - BROADCAST_LOAD_B(3, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); - } - } else { - for (; k < K; k++) { - - BROADCAST_LOAD_B(0, 0); - UNPACK_VECTOR_A(0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - BROADCAST_LOAD_B(1, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); - BROADCAST_LOAD_B(2, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); - BROADCAST_LOAD_B(3, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); - } - } - } else { - for (; k < K; k++) { - - BROADCAST_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - BROADCAST_LOAD_B(1, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); - BROADCAST_LOAD_B(2, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); - BROADCAST_LOAD_B(3, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); - } + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); } VECTOR_STORE(pg_true, 0, 0); VECTOR_STORE(pg_true, 0, 1); @@ -304,20 +409,11 @@ CNAME(BLASLONG M, BLASLONG k = 0; DECLARE_RESULT_VECTOR(0, 0); - if (LIKELY(packed_a != NULL)) { - for (; k < K; k++) { + for (; k < K; k++) { - BROADCAST_LOAD_B(0, 0); - UNPACK_VECTOR_A(0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - } - } else { - for (; k < K; k++) { - - BROADCAST_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - } + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); } VECTOR_STORE(pg_true, 0, 0); INCR_C_POINTER(0, 1); diff --git a/kernel/arm64/sgemm_small_kernel_tt_sve.c b/kernel/arm64/sgemm_small_kernel_tt_sve.c index dd9840c37..50dbd7399 100644 --- a/kernel/arm64/sgemm_small_kernel_tt_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tt_sve.c @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #undef C_ELEMENT // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] -#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] +#define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) // ASIMD @@ -207,6 +207,7 @@ CNAME(BLASLONG M, #endif { const uint64_t v_size = svcntw(); + const uint64_t v_size2 = v_size * 2; const svbool_t pg_true = svptrue_b32(); const svbool_t pg_quad = svwhilelt_b32(0, 4); const svbool_t pg_first = svwhilelt_b32(0, 1); @@ -216,18 +217,144 @@ CNAME(BLASLONG M, #endif const svuint32_t lda_vec = svindex_u32(0LL, lda); + const BLASLONG v_m2 = M & -v_size2; const BLASLONG v_m1 = M & -v_size; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; FLOAT* packed_a = - (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; + (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; FLOAT* a_offset = A; FLOAT* b_offset = B; FLOAT* c_offset = C; BLASLONG i = 0; + for (; i < v_m2; i += v_size2) { + + CREATE_C_POINTER(0, 0); + CREATE_C_POINTER(1, v_size); + CREATE_A_POINTER(0, 0); + CREATE_A_POINTER(1, v_size); + + BLASLONG j = 0; + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + if (j == 0) { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + VECTOR_PACK_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + GATHER_LOAD_A(pg_true, 1, 0); + VECTOR_PACK_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + } else { + for (; k < K; k++) { + + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + INCR_C_POINTER(0, 4); + INCR_C_POINTER(1, 4); + } + for (; j < N; j++) { + + CREATE_B_POINTER(0, 0); + UPDATE_B_POINTER(1); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(1, 0); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } else { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 1, 0); + INCR_C_POINTER(0, 1); + INCR_C_POINTER(1, 1); + } + + UPDATE_A_POINTER(v_size2); + RESET_B_POINTER(); + UPDATE_C_POINTER(v_size2); + } for (; i < v_m1; i += v_size) { CREATE_C_POINTER(0, 0); @@ -248,39 +375,14 @@ CNAME(BLASLONG M, DECLARE_RESULT_VECTOR(0, 2); DECLARE_RESULT_VECTOR(0, 3); - if (LIKELY(packed_a != NULL)) { - if (j == 0) { - for (; k < K; k++) { + for (; k < K; k++) { - QUADWORD_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - VECTOR_PACK_A(0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); - } - } else { - for (; k < K; k++) { - - QUADWORD_LOAD_B(0, 0); - UNPACK_VECTOR_A(0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); - } - } - } else { - for (; k < K; k++) { - - QUADWORD_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); - UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); - } + QUADWORD_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); + UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); } VECTOR_STORE(pg_true, 0, 0); VECTOR_STORE(pg_true, 0, 1); @@ -296,20 +398,11 @@ CNAME(BLASLONG M, BLASLONG k = 0; DECLARE_RESULT_VECTOR(0, 0); - if (LIKELY(packed_a != NULL)) { - for (; k < K; k++) { + for (; k < K; k++) { - BROADCAST_LOAD_B(0, 0); - UNPACK_VECTOR_A(0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - } - } else { - for (; k < K; k++) { - - BROADCAST_LOAD_B(0, 0); - GATHER_LOAD_A(pg_true, 0, 0); - UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); - } + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); } VECTOR_STORE(pg_true, 0, 0); INCR_C_POINTER(0, 1);