From 39724e8128cee3ab49aaa1f508e97bf9f56db61e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Jul 2020 01:14:08 +0200 Subject: [PATCH 1/8] Separate OpenMP handling and allow compilation of Power9 code with older gcc --- Makefile.power | 54 ++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/Makefile.power b/Makefile.power index c1556fe82..37a02d692 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,54 +10,36 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp -else COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif -endif ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -mcpu=power8 -mtune=power8 else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp +CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp -else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp -endif -else -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to 
fully support POWER9, getting a newer version of gcc is recommended) +FCOMMON_OPT += -mcpu=power8 -mtune=power8 +else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif else FCOMMON_OPT += -O2 -Mrecursive endif endif -endif ifeq ($(CORE), POWER8) -ifeq ($(USE_OPENMP), 1) -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp -endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp -else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp -endif -else ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else @@ -73,6 +55,18 @@ else FCOMMON_OPT += -O2 -Mrecursive endif endif + +ifeq ($(USE_OPENMP), 1) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +CCOMMON_OPT += -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +FCOMMON_OPT += -DUSE_OPENMP -mp +endif endif # workaround for C->FORTRAN ABI violation in LAPACKE From f77b6a83f4c20ca4e4769a999a69b0f47f7f4bb1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 29 Jul 2020 18:59:32 -0500 Subject: [PATCH 2/8] dgemv optimization for POWER10 Making use of new vector pair POWER10 instructions in dgemv_n and dgemv_t. Also adding a new block 4x128 to make use of Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. Tested on simulator and there are no new test failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/dgemv_n_microk_power10.c | 268 ++++++++ kernel/power/dgemv_n_power10.c | 565 +++++++++++++++++ kernel/power/dgemv_t_power10.c | 840 ++++++++++++++++++++++++++ 4 files changed, 1675 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dgemv_n_microk_power10.c create mode 100644 kernel/power/dgemv_n_power10.c create mode 100644 kernel/power/dgemv_t_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 39f5e9414..f390fac61 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -187,12 +187,12 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c -DGEMVTKERNEL = dgemv_t.c +DGEMVTKERNEL = dgemv_t_power10.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c new file mode 100644 index 000000000..4be8a5f9b --- /dev/null +++ b/kernel/power/dgemv_n_microk_power10.c @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + double *a0; + double *a1; + double *a2; + double *a3; + + __asm__ + ( + "lxvp 40, 0(%10) \n\t" // x0, x1 + XXSPLTD_S(32,%x9,0) // alpha, alpha + + "sldi %6, %13, 3 \n\t" // lda * sizeof (double) + + "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha + + "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda + "add %6, %6, %6 \n\t" // 2 * lda + + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + + "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda + + "dcbt 0, %3 \n\t" + "dcbt 0, %4 \n\t" + "dcbt 
0, %5 \n\t" + "dcbt 0, %6 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "dcbt 0, %2 \n\t" + + "addi %3, %3, 32 \n\t" + "addi %4, %4, 32 \n\t" + "addi %5, %5, 32 \n\t" + "addi %6, %6, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" + "#a0=%3 a1=%4 a2=%5 a3=%6" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3) // 6 + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 9 + "r" (x), // 10 + "b" (16), // 11 + "3" (ap), // 12 + "4" (lda) // 13 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c new file mode 100644 index 000000000..ad5f1ba0d --- /dev/null +++ b/kernel/power/dgemv_n_power10.c @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef __vector_pair __attribute__((aligned(8))) vecp_t; + +#include "dgemv_n_microk_power10.c" + +#define MMA(X, APTR, ACC) \ + rX = (vec_t *) & X; \ + rowA = *((vecp_t*)((void*)&APTR)); \ + __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); + +#define SAVE(ACC, Z) \ + rowC = (v4sf_t *) &y[Z]; \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][1] = result[1][0]; \ + result[2][1] = result[3][0]; \ + rowC[0] += valpha * result[0]; \ + rowC[1] += valpha * result[2]; + +void +dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, + FLOAT * y, FLOAT alpha) +{ + BLASLONG i, j, tmp; + FLOAT *a0 = a_ptr; + FLOAT *x1 = xo; + vector double valpha = { alpha, alpha }; + v4sf_t *rowC; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + v4sf_t result[4]; + vecp_t rowA; + vec_t *rX; + tmp = (n / 32) * 32; + for (i = 0; i < tmp; i += 32) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); +
__builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], 
&acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i + 0); + SAVE (&acc1, i + 4); + SAVE (&acc2, i + 8); + SAVE (&acc3, i + 12); + SAVE (&acc4, i + 16); + SAVE (&acc5, i + 20); + SAVE (&acc6, i + 24); + SAVE (&acc7, i + 28); + + } + for (i = tmp; i < n; i += 4) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i); + } +} + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + FLOAT *a0 = a_ptr; + FLOAT *a1 = a0 + lda; + FLOAT *a2 = a1 + lda; + FLOAT *a3 = a2 + lda; + + + for ( i=0; i<4; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<2; i++) + x[i] = xo[i] * alpha; 
+ + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<1; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 7; + n1 = (n - (n128 * 128)) >> 2; + n2 = (n - (n128 * 128)) & 3; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + for( i = 0; i < n128 ; i++) + { + dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda128; + x_ptr += 128; + } + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + for( i = 0; i < n128 ; i++) + { + FLOAT xbuffer[128] __attribute__ ((aligned (16))); + BLASLONG j; + for ( j = 0; j < 128 ; j++) + { + xbuffer[j] = x_ptr[0]; + x_ptr += inc_x; + } + dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda128; + } + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr 
+= inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] 
* x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c new file mode 100644 index 000000000..3db4d5785 --- /dev/null +++ b/kernel/power/dgemv_t_power10.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#define NBMAX 1024 +//#define PREFETCH 1 +#include <altivec.h> + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + "lxvp 32, 0(%[x]) \n\t" + "lxvp 36, 0(%[a0]) \n\t" + "lxvp 38, 0(%[a1]) \n\t" + "lxvp 40, 0(%[a2]) \n\t" + "lxvp 42, 0(%[a3]) \n\t" + "lxvp 44, 0(%[a4]) \n\t" + "lxvp 46, 0(%[a5]) \n\t" +
"lxvp 48, 0(%[a6]) \n\t" + "lxvp 50, 0(%[a7]) \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- two%= \n\t" + + //-------------------------------------------------- + ".align 5 \n\t" + "one%=: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 32(%[a0]) \n\t" + "lxvp 38, 32(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 32(%[a2]) \n\t" + "lxvp 42, 32(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 32(%[a4]) \n\t" + "lxvp 46, 32(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. %[n],%[n],-4 \n\t" + "lxvp 48, 32(%[a6]) \n\t" + "lxvp 50, 32(%[a7]) \n\t" + "lxvp 32, 32(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 64(%[a0]) \n\t" + "lxvp 38, 64(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 64(%[a2]) \n\t" + "lxvp 42, 64(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 64(%[a4]) \n\t" + "lxvp 46, 64(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 64(%[a6]) \n\t" + "lxvp 50, 64(%[a7]) \n\t" + "lxvp 32, 64(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 96(%[a0]) \n\t" + "lxvp 38, 96(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvp 40, 96(%[a2]) \n\t" + "lxvp 42, 96(%[a3]) \n\t" + "addi %[off], %[off],32 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 96(%[a4]) \n\t" + "lxvp 46, 96(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvp 48, 96(%[a6]) \n\t" + "lxvp 50, 96(%[a7]) \n\t" + "lxvp 32, 96(%[x]) \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "ble- two%= \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 128(%[a0]) \n\t" + "lxvp 38, 128(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 128(%[a2]) \n\t" + "lxvp 42, 128(%[a3]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 128(%[a4]) \n\t" + "lxvp 46, 128(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 128(%[a6]) \n\t" + "lxvp 50, 128(%[a7]) \n\t" + "lxvp 32, 128(%[x]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "addi %[a0], %[a0], 128 \n\t" + "addi %[a1], %[a1], 128 \n\t" + "addi %[a2], %[a2], 128 \n\t" + "addi %[a3], %[a3], 128 \n\t" + "addi %[a4], %[a4], 128 \n\t" + "addi %[a5], %[a5], 128 \n\t" + "addi %[a6], %[a6], 128 \n\t" + "addi %[a7], %[a7], 128 \n\t" + "addi %[x], %[x], 128 \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + XXSPLTD_S(36,%x[alpha],0) + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvp 38, 0(%[y]) \n\t" + "lxvp 40, 32(%[y]) \n\t" + + + + XXMRGLD_S(42,35,34) + XXMRGHD_S(43,35,34) + + XXMRGLD_S(44,5,4) + XXMRGHD_S(45,5,4) + + "xvadddp 42,42,43 \n\t" + + XXMRGLD_S(46,7,6) + XXMRGHD_S(47,7,6) + + "xvadddp 44,44,45 \n\t" + + XXMRGLD_S(48,9,8) + XXMRGHD_S(49,9,8) + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 39,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 41,46,36 \n\t" + + "stxvp 38, 0(%[y]) \n\t" + "xvmaddadp 40,48,36 \n\t" + "stxvp 40, 32(%[y]) \n\t" + + : [memy] "+m" (*(double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[n*8]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : 
"cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register 
__vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, 
FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, 
a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j 
< (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + From 104aa678b0f4bc4dd9f65959d0b6f1aeb7b6f6d3 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Thu, 30 Jul 2020 11:40:52 +0200 Subject: [PATCH 3/8] Fix inadvertent version number reversal to 0.3.9.dev caused by #2710 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38..4bef6570c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 589c74aed38bb7923d6653fa9370b81e4fe95b4a Mon Sep 17 00:00:00 2001 From: Kevin Adler Date: Thu, 30 Jul 2020 20:52:16 -0500 Subject: [PATCH 4/8] Use systemcfg APIs for CPU detection on AIX AIX libc already provides ready access to an integer that contains a bit identifying the CPU it's running on, so there's no need to call a program and grep its output. Additionally, prtconf is not available in the PASE runtime, which provides an AIX emulation layer on the IBM i operating system. The AIX systemcfg.h also provides macro definitions like POWER_8, POWER_9, etc for all the bits defining the CPUs as well as macros like __power_8(), __power_9_andup() that return booleans, but I did not use them. Since these macros depend on the level of the OS in which it is built, they may not be defined and instead the associated hex literals are used directly. 
--- cpuid_power.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index 8f578d68f..df3dc8668 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -38,6 +38,7 @@ #include #ifdef _AIX +#include <sys/systemcfg.h> #include #endif #ifdef __APPLE__ @@ -137,35 +138,19 @@ int detect(void){ #endif #ifdef _AIX - FILE *infile; - char buffer[512], *p; + // Cast from int to unsigned to ensure comparisons work for all bits in + // the bit mask, even the top bit + unsigned implementation = (unsigned) _system_configuration.implementation; - p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Pro", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - pclose(infile); - - if (strstr(p, "POWER3")) return CPUTYPE_POWER3; - if (strstr(p, "POWER4")) return CPUTYPE_POWER4; - if (strstr(p, "PPC970")) return CPUTYPE_PPC970; - if (strstr(p, "POWER5")) return CPUTYPE_POWER5; - if (strstr(p, "POWER6")) return CPUTYPE_POWER6; - if (strstr(p, "POWER7")) return CPUTYPE_POWER6; - if (strstr(p, "POWER8")) return CPUTYPE_POWER8; - if (strstr(p, "POWER9")) return CPUTYPE_POWER9; - if (strstr(p, "POWER10")) return CPUTYPE_POWER10; - if (strstr(p, "Cell")) return CPUTYPE_CELL; - if (strstr(p, "7447")) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; + if (implementation >= 0x40000u) return CPUTYPE_POWER10; + else if (implementation & 0x20000) return CPUTYPE_POWER9; + else if (implementation & 0x10000) return CPUTYPE_POWER8; + else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x04000) return CPUTYPE_POWER6; + else if (implementation & 0x02000) return CPUTYPE_POWER5; + else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 + else if (implementation & 0x00800) return CPUTYPE_POWER4; + else return CPUTYPE_POWER3; #endif #ifdef __APPLE__
From da9e2a7adafc2e0d321e6f2f90beaffed2853372 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Jul 2020 16:03:33 +0200 Subject: [PATCH 5/8] Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes --- Makefile | 3 ++- Makefile.install | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index e113026dd..c1d943fac 100644 --- a/Makefile +++ b/Makefile @@ -365,11 +365,12 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h diff --git a/Makefile.install b/Makefile.install index dad869f4c..12713a6db 100644 --- a/Makefile.install +++ b/Makefile.install @@ -45,7 +45,16 @@ install : lib.grd ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" + @cp cblas.h cblas.tmp +ifdef SYMBOLPREFIX + @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 + @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp +endif + @sed 's/common/openblas_config/g' cblas.tmp > 
"$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifneq ($(OSNAME), AIX) @@ -168,4 +177,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - From 60cd5e55fc2b8d50b52ebc54c701cb7315ad74ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 12:31:39 +0200 Subject: [PATCH 6/8] Protect against inadvertent activation of USE_CUDA --- driver/others/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/Makefile b/driver/others/Makefile index 5653f3c25..7558ec058 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -47,8 +47,10 @@ endif endif ifdef USE_CUDA +ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif +endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) From ecf4b9e0fca35ed15e3b0354002584fbd29a6166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 17:06:03 +0200 Subject: [PATCH 7/8] Improve substitution rules for SYMBOLPREFIX and -SUFFIX addition --- Makefile.install | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile.install b/Makefile.install index 12713a6db..01c0b1226 100644 --- a/Makefile.install +++ b/Makefile.install @@ -47,12 +47,18 @@ ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @cp cblas.h cblas.tmp ifdef SYMBOLPREFIX - @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 - @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp endif ifdef SYMBOLSUFFIX - @sed 
's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 - @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp endif @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif From 6794ac34153d9def9a1056738090160868417702 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 11:20:08 +0200 Subject: [PATCH 8/8] Add SYMBOLPREFIX and/or -SUFFIX to cblas.h if needed --- CMakeLists.txt | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38..c324e2241 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -249,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -358,10 +358,21 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT "${SYMBOLPREFIX}" STREQUAL "") +
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT "${SYMBOLSUFFIX}" STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif()