From 1265eee85c304c2b7d33d5b48d6128de28acb1ca Mon Sep 17 00:00:00 2001 From: psykose Date: Fri, 9 Aug 2024 20:38:05 +0200 Subject: [PATCH 1/8] fix cmake typo for power10 cc version check fixes 668f48f4fc80db2d886576f20b7d4ddb6defd4c1 --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 683c3181d..a0b73ddae 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -263,7 +263,7 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL POWER10) - if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") else () message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.") From 7ca835a82c5cb315997949804f134f32d9a14b70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Aug 2024 13:44:56 +0200 Subject: [PATCH 2/8] address clang array overflow warning --- kernel/x86_64/sbgemv_t_microk_cooperlake_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 8a3a022fb..69370e744 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -2680,7 +2680,7 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_bridge[8]; + __m512 accum512_bridge[16]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; __m128 accum128; From 824306baabbf91555c07f81ec98b594584dc5952 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Aug 2024 14:44:13 +0200 Subject: [PATCH 3/8] flesh out HERK prototype --- lapack/potrf/potrf_L_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 7d6bcd776..6a2e4d430 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads); #endif } } From 73e13b027381833a003f42790ddcd4ff087e9798 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Aug 2024 14:45:40 +0200 Subject: [PATCH 4/8] flesh out HERK prototype --- lapack/potrf/potrf_U_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 1f1427276..de7d33374 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads); #endif } } From d8f740791a6f21e6d40c879bf2d8e127c4627d73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Aug 2024 14:50:49 +0200 Subject: [PATCH 5/8] tweak threshold a little more to cover POWER10 fma --- lapack-netlib/TESTING/stest_rfp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/stest_rfp.in b/lapack-netlib/TESTING/stest_rfp.in index 9b082b7df..0e391aacc 100644 --- a/lapack-netlib/TESTING/stest_rfp.in +++ b/lapack-netlib/TESTING/stest_rfp.in @@ -5,5 +5,5 @@ Data file for testing REAL LAPACK linear equation routines RFP format 1 2 15 Values of NRHS (number of right hand sides) 9 Number of matrix types (list types on next line if 0 < NTYPES < 9) 1 2 3 4 5 6 7 8 9 Matrix Types -42.0 Threshold value of test ratio +45.0 Threshold value of test ratio T Put T to test the error exits From b1737698db5773ffde6a3a6c8586da4bfb991099 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 13 Aug 2024 07:01:21 -0500 Subject: [PATCH 6/8] Fix DEFAULTS in SBGEMM for POWER10. Also comparisons for SBGEMM unit test can be exactly due to epilison differences. --- param.h | 4 +-- test/compare_sgemm_sbgemm.c | 68 +++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/param.h b/param.h index 2618e1f60..0e4d8965d 100644 --- a/param.h +++ b/param.h @@ -2637,8 +2637,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef SBGEMM_DEFAULT_Q #define SBGEMM_DEFAULT_UNROLL_M 16 #define SBGEMM_DEFAULT_UNROLL_N 8 -#define SBGEMM_DEFAULT_P 832 -#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_P 512 +#define SBGEMM_DEFAULT_Q 1024 #define SBGEMM_DEFAULT_R 4096 #endif diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index de589458b..4b546fb1f 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,6 +81,8 @@ float16to32 (bfloat16_bits f16) return f32.v; } +#define SBGEMM_LARGEST 256 + int main (int argc, char *argv[]) { @@ -88,12 +90,39 @@ main (int argc, char *argv[]) int i, j, l; blasint x, y; int ret = 0; - int loop = 100; + int loop = SBGEMM_LARGEST; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; for (x = 0; x <= loop; x++) { + if ((x > 100) && (x != SBGEMM_LARGEST)) continue; + m = k = n = x; + float *A = (float *)malloc(m * k * sizeof(FLOAT)); + float *B = (float *)malloc(k * n * sizeof(FLOAT)); + float *C = (float *)malloc(m * n * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc(m * k * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc(k * n * sizeof(bfloat16_bits)); + float *DD = (float *)malloc(m * n * sizeof(FLOAT)); + float *CC = (float *)malloc(m * n * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + bfloat16 atmp,btmp; + blasint one=1; + + for (j = 0; j < m; j++) + { + for (i = 0; i < n; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; + } + } for (y = 0; y < 4; y++) { if ((y == 0) || (y == 2)) { @@ -106,34 +135,16 @@ main (int argc, char *argv[]) } else { transB = 'T'; } - m = k = n = x; - float A[m * k]; - float B[k * n]; - float C[m * n]; - bfloat16_bits AA[m * k], BB[k * n]; - float DD[m * n], CC[m * n]; - bfloat16 atmp,btmp; - blasint one=1; - for (j = 0; j < m; j++) - { - for (i = 0; i < m; i++) - { - A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - C[j * k + i] = 0; - sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); - AA[j * k + i].v = atmp; - BB[j * k + i].v = btmp; - CC[j * k + i] = 0; - DD[j * k + i] = 0; - } - } + memset(CC, 0, m * n * sizeof(FLOAT)); + memset(DD, 0, m * n * sizeof(FLOAT)); + memset(C, 0, m * n * sizeof(FLOAT)); + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, &m, (bfloat16*)BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) @@ -160,9 +171,16 @@ main (int argc, char *argv[]) } for (i = 0; i < n; i++) for (j = 0; j < m; j++) - if (CC[i * m + j] != DD[i * m + j]) + if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) ret++; } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); } if (ret != 0) From 20bdb658828e62a01dcc0b97edf14cb56f3ea6a8 Mon Sep 17 00:00:00 2001 From: Henry Chen Date: Mon, 12 Aug 2024 16:22:31 +0800 Subject: [PATCH 7/8] Fix recursive variable expansion in Makefiles for LOONGSON3A --- ctest/Makefile | 2 +- test/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index c02e04e1a..877a190c1 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -26,7 +26,7 @@ endif override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) - override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 + override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 endif override FFLAGS += -fno-tree-vectorize endif diff --git a/test/Makefile b/test/Makefile index cfb2d41f5..65576d3dd 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,7 +2,7 @@ TOPDIR = .. include ../Makefile.system ifeq ($(F_COMPILER),GFORTRAN) ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) - override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 + override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 endif override FFLAGS += -fno-tree-vectorize endif From 31226740d6f12c39e3f7ac3d3eb1475180121b5e Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 14 Aug 2024 08:10:25 -0500 Subject: [PATCH 8/8] Cleanup of SBGEMM unit test. --- test/compare_sgemm_sbgemm.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 4b546fb1f..395317441 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -113,13 +113,19 @@ main (int argc, char *argv[]) for (j = 0; j < m; j++) { - for (i = 0; i < n; i++) + for (i = 0; i < k; i++) { A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); AA[j * k + i].v = atmp; + } + } + for (j = 0; j < n; j++) + { + for (i = 0; i < k; i++) + { + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); BB[j * k + i].v = btmp; } } @@ -147,10 +153,7 @@ main (int argc, char *argv[]) for (i = 0; i < n; i++) for (j = 0; j < m; j++) - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) - ret++; - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) + { for (l = 0; l < k; l++) if (transA == 'N' && transB == 'N') { @@ -169,10 +172,11 @@ main (int argc, char *argv[]) DD[i * m + j] += float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); } - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) ret++; + } } free(A); free(B);