diff --git a/Makefile.system b/Makefile.system index cc72c02e8..db651ef99 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,7 +617,6 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -627,11 +626,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b3ee301be..a0bc1a777 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ + rowC[0] = result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] = result[1] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define SET_ACC_ZERO4() \ diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 01c122c6d..81a5ec76b 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #if defined(TRMMKERNEL) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ + rowC[0] = result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] = result[1] * alpha; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[4] * alpha; \ + rowC[0] = result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] = result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define KERNEL(i, j) \ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c index 7455f925c..1ae9e04bf 100644 --- a/kernel/power/shgemm_kernel_power10.c +++ b/kernel/power/shgemm_kernel_power10.c @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) #define BF16TOF32(x) x #endif -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); @@ -64,54 +64,54 @@ vector char mask = #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #define SET_ACC_ZERO4() \ __builtin_mma_xxsetaccz (&acc0); \