From 7947970f9d5d88a9399c691a0911689c592f5d37 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Fri, 13 Sep 2024 06:22:13 -0500
Subject: [PATCH] Move common code.

---
 kernel/power/gemm_common.c   | 148 +++++++++++++++++++++++++++++++++++
 kernel/power/sbgemv_common.c | 133 +------------------------------
 kernel/power/sbgemv_n.c      |   2 +-
 kernel/power/sbgemv_n_vsx.c  |   3 +-
 4 files changed, 152 insertions(+), 134 deletions(-)
 create mode 100644 kernel/power/gemm_common.c

diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c
new file mode 100644
index 000000000..c33faffe0
--- /dev/null
+++ b/kernel/power/gemm_common.c
@@ -0,0 +1,148 @@
+#ifndef GEMM_COMMON_C
+#define GEMM_COMMON_C
+#include "common.h"
+
+#include <altivec.h>
+
+#define FORCEINLINE inline __attribute__((always_inline))
+
+#ifdef __clang__
+#define uint16_t unsigned short
+#define uint32_t unsigned int
+#define uint64_t unsigned long long
+#endif
+
+#ifdef _ARCH_PWR10
+#ifdef __has_builtin
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#if !__has_builtin(__builtin_vsx_disassemble_pair)
+#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
+#endif
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0)
+#else
+#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1)
+#endif
+
+#define USE_VECTOR_PAIRS
+#endif
+
+typedef __vector IFLOAT vec_bf16;
+typedef __vector FLOAT vec_f32;
+typedef __vector unsigned char vec_uc8;
+
+FORCEINLINE vec_uc8 vec_load_vec(void *src)
+{
+    return vec_xl(0, (unsigned char *)(src));
+}
+
+FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src)
+{
+#ifdef USE_VECTOR_PAIRS
+    __vector_pair vy0p;
+    vy0p = *(__vector_pair *)(src);
+    __builtin_vsx_disassemble_pair((void *)(dst), &vy0p);
+#else
+    dst[0] = src[0];
+    dst[1] = src[1];
+#endif
+}
+
+FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src)
+{
+#ifdef USE_VECTOR_PAIRS
+    __vector_pair vy0p;
+    __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]);
+    *(__vector_pair *)(dst) = vy0p;
+#else
+    dst[0] = src[0];
+    dst[1] = src[1];
+#endif
+}
+
+FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n)
+{
+    IFLOAT *src2 = (IFLOAT *)(src);
+#ifdef _ARCH_PWR9
+    return vec_xl_len(src2, n * sizeof(IFLOAT));
+#else
+    __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)];
+    memset(data, 0, sizeof(vec_bf16));
+    if (n & 4) {
+        memcpy(data, src2, sizeof(uint64_t));
+    }
+    if (n & 2) {
+        BLASLONG n4 = n & 4;
+        memcpy(data + n4, src2 + n4, sizeof(uint32_t));
+    }
+    if (n & 1) {
+        BLASLONG n6 = n & 6;
+        data[n6] = src2[n6];
+    }
+    return (vec_bf16)vec_load_vec(data);
+#endif
+}
+
+FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n)
+{
+#ifndef _ARCH_PWR9
+    if (n & 4) {
+        return (vec_f32)vec_load_vec(src);
+    }
+#endif
+    return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT)));
+}
+
+FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n)
+{
+    data[0] = src[0];
+    data[1] = vec_loadN_f32(&src[1], n);
+}
+
+FORCEINLINE void vec_storeN(vec_bf16 data, void *dst, BLASLONG n)
+{
+    IFLOAT *dst2 = (IFLOAT *)(dst);
+#ifdef _ARCH_PWR9
+    vec_xst_len(data, dst2, n * sizeof(IFLOAT));
+#else
+    if (n & 8) {
+        vec_xst(data, 0, dst2);
+        return;
+    }
+    __attribute__((aligned(16))) IFLOAT data2[sizeof(vec_f32) / sizeof(IFLOAT)];
+    vec_xst(data, 0, data2);
+    if (n & 4) {
+        memcpy(dst2, data2, sizeof(uint64_t));
+    }
+    if (n & 2) {
+        BLASLONG n4 = n & 4;
+        memcpy(dst2 + n4, data2 + n4, sizeof(uint32_t));
+    }
+    if (n & 1) {
+        BLASLONG n6 = n & 6;
+        dst2[n6] = data2[n6];
+    }
+#endif
+}
+
+FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n)
+{
+#ifndef _ARCH_PWR9
+    if (n & 4) {
+        vec_xst(data, 0, (FLOAT *)dst);
+        return;
+    }
+#endif
+    return vec_storeN((vec_bf16)data, dst, n * (sizeof(FLOAT) / sizeof(IFLOAT)));
+}
+
+FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n)
+{
+    dst[0] = data[0];
+    vec_storeN_f32(data[1], &dst[1], n);
+}
+#endif
diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c
index 07f75d318..46dee74c3 100644
--- a/kernel/power/sbgemv_common.c
+++ b/kernel/power/sbgemv_common.c
@@ -27,40 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef SBGEMV_COMMON_C
 #define SBGEMV_COMMON_C
-#include "common.h"
-
-#include <altivec.h>
-
-#define FORCEINLINE inline __attribute__((always_inline))
-
-#ifdef __clang__
-#define uint16_t unsigned short
-#define uint32_t unsigned int
-#define uint64_t unsigned long long
-#endif
-
-#ifdef _ARCH_PWR10
-#ifdef __has_builtin
-#if !__has_builtin(__builtin_vsx_assemble_pair)
-#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
-#endif
-#if !__has_builtin(__builtin_vsx_disassemble_pair)
-#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
-#endif
-#endif
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0)
-#else
-#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1)
-#endif
-
-#define USE_VECTOR_PAIRS
-#endif
-
-typedef __vector IFLOAT vec_bf16;
-typedef __vector FLOAT vec_f32;
-typedef __vector unsigned char vec_uc8;
+#include "gemm_common.c"
 
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 #define BF16_HI(data, zero) (vec_f32)vec_mergeh(data, zero)
@@ -70,108 +37,12 @@ typedef __vector unsigned char vec_uc8;
 #define BF16_LO(data, zero) (vec_f32)vec_mergel(zero, data)
 #endif
 
-FORCEINLINE vec_uc8 vec_load_vec(void *src)
-{
-    return vec_xl(0, (unsigned char *)(src));
-}
-
-FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src)
-{
-#ifdef USE_VECTOR_PAIRS
-    __vector_pair vy0p;
-    vy0p = *(__vector_pair *)(src);
-    __builtin_vsx_disassemble_pair((void *)(dst), &vy0p);
-#else
-    dst[0] = src[0];
-    dst[1] = src[1];
-#endif
-}
-
-FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src)
-{
-#ifdef USE_VECTOR_PAIRS
-    __vector_pair vy0p;
-    __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]);
-    *(__vector_pair *)(dst) = vy0p;
-#else
-    dst[0] = src[0];
-    dst[1] = src[1];
-#endif
-}
-
-FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n)
-{
-    IFLOAT *src2 = (IFLOAT *)(src);
-#ifdef _ARCH_PWR9
-    return vec_xl_len(src2, n * sizeof(IFLOAT));
-#else
-    __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)];
-    memset(data, 0, sizeof(vec_bf16));
-    if (n & 4) {
-        memcpy(data, src2, sizeof(uint64_t));
-    }
-    if (n & 2) {
-        BLASLONG n4 = n & 4;
-        memcpy(data + n4, src2 + n4, sizeof(uint32_t));
-    }
-    if (n & 1) {
-        BLASLONG n6 = n & 6;
-        data[n6] = src2[n6];
-    }
-    return (vec_bf16)vec_load_vec(data);
-#endif
-}
-
 FORCEINLINE vec_f32 vec_loadNHi(void *src, BLASLONG n, vec_bf16 zero)
 {
     vec_bf16 data = vec_loadN(src, n);
     return BF16_HI(data, zero);
 }
 
-FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n)
-{
-#ifndef _ARCH_PWR9
-    if (n & 4) {
-        return (vec_f32)vec_load_vec(src);
-    }
-#endif
-    return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT)));
-}
-
-FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n)
-{
-    data[0] = src[0];
-    data[1] = vec_loadN_f32(&src[1], n);
-}
-
-FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n)
-{
-    FLOAT *dst2 = (FLOAT *)(dst);
-#ifdef _ARCH_PWR9
-    vec_xst_len(data, dst2, n * sizeof(FLOAT));
-#else
-    if (n & 4) {
-        vec_xst(data, 0, dst2);
-        return;
-    }
-    __attribute__((aligned(16))) FLOAT data2[sizeof(vec_f32) / sizeof(FLOAT)];
-    vec_xst(data, 0, data2);
-    if (n & 2) {
-        memcpy(dst2, data2, sizeof(uint64_t));
-    }
-    if (n & 1) {
-        BLASLONG n2 = n & 2;
-        dst2[n2] = data2[n2];
-    }
-#endif
-}
-
-FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n)
-{
-    dst[0] = data[0];
-    vec_storeN_f32(data[1], &dst[1], n);
-}
-
 FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero)
 {
     vec_f32 v_in00 = BF16_HI(in0, zero);
@@ -297,7 +168,7 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F
     }
 }
 
-FORCEINLINE void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 {
     for (BLASLONG i = 0; i < n; i++) {
         *dest = *src++;
diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c
index 05c02a006..c7559a47c 100644
--- a/kernel/power/sbgemv_n.c
+++ b/kernel/power/sbgemv_n.c
@@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *
         a += NB;
 
         if (inc_y != 1) {
-            add_y(NB, ybuffer, y_ptr, inc_y);
+            move_y(NB, ybuffer, y_ptr, inc_y);
             y_ptr += (NB * inc_y);
         } else {
             y_ptr += NB;
diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c
index 45570950e..cab2316d4 100644
--- a/kernel/power/sbgemv_n_vsx.c
+++ b/kernel/power/sbgemv_n_vsx.c
@@ -269,8 +269,7 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS
             vec_loadN_mult2(v_x7, &vb3[i], n, zero, vy0);
 
             vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3);
-        } else
-        if (n) {
+        } else if (n) {
             vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n);
 
             vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero);
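
Not part of the patch above: a minimal standalone sketch of the tail-handling idea used by the non-POWER9 fallback in vec_loadN()/vec_storeN(), with plain uint16_t standing in for the kernel's IFLOAT/bf16 element type. The helper name copyN_bf16 is invented for illustration; the point is how the bits of n (n & 4, n & 2, n & 1) select an 8-, 4- and 2-byte chunk so that any n < 8 elements are moved without touching memory past the n-th element.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void copyN_bf16(const uint16_t *src, uint16_t *dst, long n)
{
    if (n & 4) {                 /* elements 0..3 as one 64-bit chunk  */
        memcpy(dst, src, sizeof(uint64_t));
    }
    if (n & 2) {                 /* next two elements start at n & 4   */
        long n4 = n & 4;
        memcpy(dst + n4, src + n4, sizeof(uint32_t));
    }
    if (n & 1) {                 /* last element sits at index n & 6   */
        long n6 = n & 6;
        dst[n6] = src[n6];
    }
}

int main(void)
{
    uint16_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {0};
    copyN_bf16(src, dst, 7);     /* copies elements 0..6, leaves dst[7] */
    for (int i = 0; i < 8; i++)
        printf("%u ", dst[i]);
    printf("\n");                /* expected: 1 2 3 4 5 6 7 0 */
    return 0;
}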