POWER10: Rename mma builtins

The LLVM and GCC teams agreed to rename the __builtin_mma_assemble_pair and
__builtin_mma_disassemble_pair built-ins to __builtin_vsx_assemble_pair and
__builtin_vsx_disassemble_pair, respectively. This patch makes the
corresponding changes in the dgemm kernel. It also changes the inputs to
those built-ins to avoid some potential typecasting issues.

Reference gcc commit id: 77ef995c1fbcab76a2a69b9f4700bcfd005d8e62
Author: Rajalakshmi Srinivasaraghavan
Date:   2021-02-26 20:56:34 -06:00
Commit: 0571c3187b (parent d12a2d0d04)
1 changed file with 37 additions and 40 deletions
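
For readers unfamiliar with these built-ins, here is a minimal sketch of the
pattern the kernel uses: two 16-byte VSX vectors (four doubles of B) are
assembled into a __vector_pair, which feeds an FP64 outer-product accumulate.
This is illustrative only and not part of the patch; the ger_step helper and
its parameters are invented for the example. Build for POWER10, e.g. with
gcc -mcpu=power10.

    typedef __vector unsigned char vec_t;

    /* Hypothetical helper, not from the kernel: one outer-product step.  */
    void ger_step (__vector_quad *acc, double *BO, double *AO)
    {
      __vector_pair rowB;
      vec_t *rb = (vec_t *) BO;     /* four doubles of B as two vectors */
      vec_t *rowA = (vec_t *) AO;   /* two doubles of A as one vector   */
      /* New spelling; older compilers know it as __builtin_mma_assemble_pair.  */
      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
      __builtin_mma_xvf64ger (acc, rowB, rowA[0]);
    }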


@@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 typedef __vector unsigned char vec_t;
 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#if !__has_builtin(__builtin_vsx_disassemble_pair)
+#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
+#endif
 #ifdef TRMMKERNEL
 #define SAVE_ACC(ACC, J) \
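
The guard above is what keeps the file building on both sides of the rename:
if the compiler does not recognize the new __builtin_vsx_* names, they are
#defined back to the old __builtin_mma_* spellings. The __has_builtin probe
itself is available in Clang and in GCC 10 and newer, and GCC 10 is also the
first release with MMA support, so the check should be safe wherever this
kernel compiles.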
@@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       vec_t *rb = (vec_t *) & BO[0];
       __vector_pair rowB, rowB1;
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
       __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
         {
           rowA = (vec_t *) & AO[l << 3];
           rb = (vec_t *) & BO[l << 3];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB, rowB1;
       vec_t *rb = (vec_t *) & BO[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
       __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
         {
           rowA = (vec_t *) & AO[l << 2];
           rb = (vec_t *) & BO[l << 3];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB, rowB1;
       vec_t *rb = (vec_t *) & BO[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
       for (l = 1; l < temp; l++)
         {
           rowA = (vec_t *) & AO[l << 1];
           rb = (vec_t *) & BO[l << 3];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
         }
@@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB;
       vec_t *rb = (vec_t *) & BO[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
       __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
@@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
         {
           rowA = (vec_t *) & AO[l << 3];
           rb = (vec_t *) & BO[l << 2];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
           __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB;
       vec_t *rb = (vec_t *) & BO[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
       for (l = 1; l < temp; l++)
         {
           rowA = (vec_t *) & AO[l << 2];
           rb = (vec_t *) & BO[l << 2];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
         }
@@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB;
       vec_t *rb = (vec_t *) & BO[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       for (l = 1; l < temp; l++)
         {
           rowA = (vec_t *) & AO[l << 1];
           rb = (vec_t *) & BO[l << 2];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
         }
       SAVE_ACC (&acc0, 0);
@@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       v4sf_t result[4];
       __vector_quad acc0, acc1, acc2, acc3;
       BLASLONG l = 0;
-      FLOAT t[4] = { 0, 0, 0, 0 };
-      t[0] = BO[0], t[1] = BO[1];
       __vector_pair rowB;
-      vec_t *rb = (vec_t *) & t[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      vec_t *rb = (vec_t *) & BO[0];
+      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
       vec_t *rowA = (vec_t *) & AO[0];
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
@@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
       for (l = 1; l < temp; l++)
         {
-          t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-          rb = (vec_t *) & t[0];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          rb = (vec_t *) & BO[l << 1];
+          __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
           rowA = (vec_t *) & AO[l << 3];
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       v4sf_t result[4];
       __vector_quad acc0, acc1;
       BLASLONG l = 0;
-      FLOAT t[4] = { 0, 0, 0, 0 };
-      t[0] = BO[0], t[1] = BO[1];
       __vector_pair rowB;
-      vec_t *rb = (vec_t *) & t[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      vec_t *rb = (vec_t *) & BO[0];
+      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
       vec_t *rowA = (vec_t *) & AO[0];
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
       for (l = 1; l < temp; l++)
         {
-          t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-          rb = (vec_t *) & t[0];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          rb = (vec_t *) & BO[l << 1];
+          __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
           rowA = (vec_t *) & AO[l << 2];
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
           __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       v4sf_t result[4];
       __vector_quad acc0;
       BLASLONG l = 0;
-      FLOAT t[4] = { 0, 0, 0, 0 };
-      t[0] = BO[0], t[1] = BO[1];
       __vector_pair rowB;
-      vec_t *rb = (vec_t *) & t[0];
-      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+      vec_t *rb = (vec_t *) & BO[0];
+      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
       vec_t *rowA = (vec_t *) & AO[0];
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       for (l = 1; l < temp; l++)
         {
-          t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-          rb = (vec_t *) & t[0];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          rb = (vec_t *) & BO[l << 1];
+          __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
           rowA = (vec_t *) & AO[l << 1];
           __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
         }
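
The last three hunks are the "typecasting" part of the commit message.
Before, the two B values for these narrow cases were staged through a
zero-padded FLOAT buffer and the buffer was cast to vec_t; now the doubles
are read in place and the same vector is passed for both halves of the pair.
A before/after sketch, using the kernel's own names (the reading of why the
duplicated half is harmless is my interpretation, not stated in the commit):

    /* Before: stage BO[0..1] in a zeroed buffer so that rb[0] and rb[1]
       both name valid 16-byte vectors, then cast FLOAT* to vec_t*.  */
    FLOAT t[4] = { 0, 0, 0, 0 };
    t[0] = BO[0], t[1] = BO[1];
    vec_t *rb = (vec_t *) & t[0];
    __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);

    /* After: point rb straight at BO and reuse rb[0] for both halves.
       The results produced by the duplicated half are never stored back
       (assumption), so the copy and the round trip through the temporary
       buffer disappear.  */
    vec_t *rb = (vec_t *) & BO[0];
    __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);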