From 39724e8128cee3ab49aaa1f508e97bf9f56db61e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 30 Jul 2020 01:14:08 +0200
Subject: [PATCH 1/8] Separate OpenMP handling and allow compilation of Power9
 code with older gcc

---
 Makefile.power | 54 ++++++++++++++++++++++----------------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index c1556fe82..37a02d692 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -10,54 +10,36 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-ifeq ($(USE_OPENMP), 1)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx  -DUSE_OPENMP -fno-fast-math -fopenmp
-FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -DUSE_OPENMP -fno-fast-math -fopenmp
-else
 COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx  -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
-endif
 
 ifeq ($(CORE), POWER9)
-ifeq ($(USE_OPENMP), 1)
 ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx  -DUSE_OPENMP -fno-fast-math -fopenmp
+CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
+ifneq ($(GCCVERSIONGT4), 1) # anything but GCCVERSIONGT4=1: warn and generate power8 code; NOTE(review): presumably also taken by non-GCC, non-PGI compilers that leave it unset - confirm
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -mcpu=power8 -mtune=power8 
 else
-CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
+CCOMMON_OPT += -mcpu=power9 -mtune=power9 
 endif
-ifneq ($(F_COMPILER), PGI)
-FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9  -DUSE_OPENMP -fno-fast-math -fopenmp
-else
-FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
-endif
-else
-ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx  -fno-fast-math
 else
 CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
 endif
 ifneq ($(F_COMPILER), PGI)
-FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9  -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -fno-fast-math
+ifneq ($(GCCVERSIONGT4), 1) # same test as for the C flags above: the Fortran flags fall back to power8 together with the C flags
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+FCOMMON_OPT += -mcpu=power8 -mtune=power8 
+else
+FCOMMON_OPT += -mcpu=power9 -mtune=power9 
+endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
 endif
 endif
-endif
 
 ifeq ($(CORE), POWER8)
-ifeq ($(USE_OPENMP), 1)
-ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx  -DUSE_OPENMP -fno-fast-math -fopenmp
-else
-CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
-endif
-ifneq ($(F_COMPILER), PGI)
-FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8  -DUSE_OPENMP -fno-fast-math -fopenmp
-else
-FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
-endif
-else
 ifneq ($(C_COMPILER), PGI)
 CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx  -fno-fast-math
 else
@@ -73,6 +55,18 @@ else
 FCOMMON_OPT += -O2 -Mrecursive
 endif
 endif
+# OpenMP flags, appended once on top of the per-core flags set above: -fopenmp for non-PGI compilers, -mp for PGI.
+ifeq ($(USE_OPENMP), 1)
+ifneq ($(C_COMPILER), PGI)
+CCOMMON_OPT += -DUSE_OPENMP -fopenmp
+else
+CCOMMON_OPT += -DUSE_OPENMP -mp
+endif
+ifneq ($(F_COMPILER), PGI)
+FCOMMON_OPT += -DUSE_OPENMP -fopenmp
+else
+FCOMMON_OPT += -DUSE_OPENMP -mp
+endif
 endif
 
 # workaround for C->FORTRAN ABI violation in LAPACKE

From f77b6a83f4c20ca4e4769a999a69b0f47f7f4bb1 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Wed, 29 Jul 2020 18:59:32 -0500
Subject: [PATCH 2/8] dgemv optimization for POWER10

Making use of new vector pair POWER10 instructions in dgemv_n and dgemv_t.
Also adding a new block 4x128 to make use of Matrix-Multiply Assist (MMA)
feature introduced in POWER ISA v3.1.  Tested on simulator and there
are no new test failures.
---
 kernel/power/KERNEL.POWER10           |   4 +-
 kernel/power/dgemv_n_microk_power10.c | 268 ++++++++
 kernel/power/dgemv_n_power10.c        | 565 +++++++++++++++++
 kernel/power/dgemv_t_power10.c        | 840 ++++++++++++++++++++++++++
 4 files changed, 1675 insertions(+), 2 deletions(-)
 create mode 100644 kernel/power/dgemv_n_microk_power10.c
 create mode 100644 kernel/power/dgemv_n_power10.c
 create mode 100644 kernel/power/dgemv_t_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 39f5e9414..f390fac61 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -187,12 +187,12 @@ ZSWAPKERNEL  = zswap.c
 #
 
 SGEMVNKERNEL = sgemv_n.c
-DGEMVNKERNEL = dgemv_n.c
+DGEMVNKERNEL = dgemv_n_power10.c
 CGEMVNKERNEL = cgemv_n.c
 ZGEMVNKERNEL = zgemv_n_4.c
 #
 SGEMVTKERNEL = sgemv_t.c
-DGEMVTKERNEL = dgemv_t.c
+DGEMVTKERNEL = dgemv_t_power10.c
 CGEMVTKERNEL = cgemv_t.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c
new file mode 100644
index 000000000..4be8a5f9b
--- /dev/null
+++ b/kernel/power/dgemv_n_microk_power10.c
@@ -0,0 +1,268 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*	 LAPACK-TEST		: OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+/* y[i] += alpha * (x[0]*a0[i] + x[1]*a1[i] + x[2]*a2[i] + x[3]*a3[i]), a_k = ap + k*lda; 4 rows per step and at least one step always runs, so n is presumably a positive multiple of 4 - confirm against callers. */
+static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+  double *a0;
+  double *a1;
+  double *a2;
+  double *a3;
+  /* a0..a3 are scratch outputs only: the asm receives ap and lda in the a0/a1 registers and derives the four column pointers itself. */
+  __asm__
+    (
+       "lxvp	40, 0(%10)	\n\t"	// x0 .. x3 (D-form load, so %10 has to be a base register)
+       XXSPLTD_S(32,%x9,0)	// alpha, alpha
+
+       "sldi		%6, %13, 3	\n\t"	// lda * sizeof (double)
+
+       "xvmuldp		34, 41, 32	\n\t"	// x0 * alpha, x1 * alpha
+       "xvmuldp		35, 40, 32	\n\t"	// x2 * alpha, x3 * alpha
+
+       "add		%4, %3, %6	\n\t"	// a0 = ap, a1 = a0 + lda
+       "add		%6, %6, %6	\n\t"	// 2 * lda
+
+       XXSPLTD_S(32,34,1)	// x0 * alpha, x0 * alpha
+       XXSPLTD_S(33,34,0)	// x1 * alpha, x1 * alpha
+       XXSPLTD_S(34,35,1)	// x2 * alpha, x2 * alpha
+       XXSPLTD_S(35,35,0)	// x3 * alpha, x3 * alpha
+
+       "add		%5, %3, %6	\n\t"	// a2 = a0 + 2 * lda
+       "add		%6, %4, %6	\n\t"	// a3 = a1 + 2 * lda
+
+       "dcbt		0, %3		\n\t"
+       "dcbt		0, %4		\n\t"
+       "dcbt		0, %5		\n\t"
+       "dcbt		0, %6		\n\t"
+
+       "lxvp		40, 0(%3)	\n\t"	// a0[0], a0[1]
+
+       "lxvp		42, 0(%4)	\n\t"	// a1[0], a1[1]
+
+       "lxvp		44, 0(%5)	\n\t"	// a2[0], a2[1]
+
+       "lxvp		46, 0(%6)	\n\t"	// a3[0], a3[1]
+
+       "dcbt		0, %2		\n\t"
+
+       "addi		%3, %3, 32	\n\t"
+       "addi		%4, %4, 32	\n\t"
+       "addi		%5, %5, 32	\n\t"
+       "addi		%6, %6, 32	\n\t"
+
+       "addic.		%1, %1, -4	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align	5		\n"
+     "one%=:				\n\t"
+       // main loop, unrolled 4 times; each copy updates 4 rows of y and preloads the next 4 rows of a0..a3
+       "lxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "xvmaddadp 	36, 40, 32	\n\t"
+       "xvmaddadp 	37, 41, 32	\n\t"
+
+       "lxvp		40, 0(%3)	\n\t"	// a0[0], a0[1]
+
+       "xvmaddadp 	36, 42, 33	\n\t"
+       "addi		%3, %3, 32	\n\t"
+       "xvmaddadp 	37, 43, 33	\n\t"
+
+       "lxvp		42, 0(%4)	\n\t"	// a1[0], a1[1]
+
+       "xvmaddadp 	36, 44, 34	\n\t"
+       "addi		%4, %4, 32	\n\t"
+       "xvmaddadp 	37, 45, 34	\n\t"
+
+       "lxvp		44, 0(%5)	\n\t"	// a2[0], a2[1]
+
+       "xvmaddadp 	36, 46, 35	\n\t"
+       "addi		%5, %5, 32	\n\t"
+       "xvmaddadp 	37, 47, 35	\n\t"
+
+       "stxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "lxvp		46, 0(%6)	\n\t"	// a3[0], a3[1]
+
+       "addi		%6, %6, 32	\n\t"
+       "addi		%2, %2, 32	\n\t"
+
+       "addic.		%1, %1, -4	\n\t"
+       "ble		two%=		\n\t"
+
+       // unrolled copy 2 of 4
+       "lxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "xvmaddadp 	36, 40, 32	\n\t"
+       "xvmaddadp 	37, 41, 32	\n\t"
+
+       "lxvp		40, 0(%3)	\n\t"	// a0[0], a0[1]
+
+       "xvmaddadp 	36, 42, 33	\n\t"
+       "addi		%3, %3, 32	\n\t"
+       "xvmaddadp 	37, 43, 33	\n\t"
+
+       "lxvp		42, 0(%4)	\n\t"	// a1[0], a1[1]
+
+       "xvmaddadp 	36, 44, 34	\n\t"
+       "addi		%4, %4, 32	\n\t"
+       "xvmaddadp 	37, 45, 34	\n\t"
+
+       "lxvp		44, 0(%5)	\n\t"	// a2[0], a2[1]
+
+       "xvmaddadp 	36, 46, 35	\n\t"
+       "addi		%5, %5, 32	\n\t"
+       "xvmaddadp 	37, 47, 35	\n\t"
+
+       "stxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "lxvp	46, 0(%6)	\n\t"	// a3[0], a3[1]
+
+       "addi		%6, %6, 32	\n\t"
+       "addi		%2, %2, 32	\n\t"
+
+       "addic.		%1, %1, -4	\n\t"
+       "ble		two%=		\n\t"
+
+       // unrolled copy 3 of 4
+       "lxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "xvmaddadp 	36, 40, 32	\n\t"
+       "xvmaddadp 	37, 41, 32	\n\t"
+
+       "lxvp		40, 0(%3)	\n\t"	// a0[0], a0[1]
+
+       "xvmaddadp 	36, 42, 33	\n\t"
+       "addi		%3, %3, 32	\n\t"
+       "xvmaddadp 	37, 43, 33	\n\t"
+
+       "lxvp		42, 0(%4)	\n\t"	// a1[0], a1[1]
+
+       "xvmaddadp 	36, 44, 34	\n\t"
+       "addi		%4, %4, 32	\n\t"
+       "xvmaddadp 	37, 45, 34	\n\t"
+
+       "lxvp		44, 0(%5)	\n\t"	// a2[0], a2[1]
+
+       "xvmaddadp 	36, 46, 35	\n\t"
+       "addi		%5, %5, 32	\n\t"
+       "xvmaddadp 	37, 47, 35	\n\t"
+
+       "stxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "lxvp		46, 0(%6)	\n\t"	// a3[0], a3[1]
+
+       "addi		%6, %6, 32	\n\t"
+       "addi		%2, %2, 32	\n\t"
+
+       "addic.		%1, %1, -4	\n\t"
+       "ble		two%=		\n\t"
+
+       // unrolled copy 4 of 4
+       "lxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "xvmaddadp 	36, 40, 32	\n\t"
+       "xvmaddadp 	37, 41, 32	\n\t"
+
+       "lxvp		40, 0(%3)	\n\t"	// a0[0], a0[1]
+
+       "xvmaddadp 	36, 42, 33	\n\t"
+       "addi		%3, %3, 32	\n\t"
+       "xvmaddadp 	37, 43, 33	\n\t"
+
+       "lxvp		42, 0(%4)	\n\t"	// a1[0], a1[1]
+
+       "xvmaddadp 	36, 44, 34	\n\t"
+       "addi		%4, %4, 32	\n\t"
+       "xvmaddadp 	37, 45, 34	\n\t"
+
+       "lxvp		44, 0(%5)	\n\t"	// a2[0], a2[1]
+
+       "xvmaddadp 	36, 46, 35	\n\t"
+       "addi		%5, %5, 32	\n\t"
+       "xvmaddadp 	37, 47, 35	\n\t"
+
+       "stxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "lxvp		46, 0(%6)	\n\t"	// a3[0], a3[1]
+
+       "addi		%6, %6, 32	\n\t"
+       "addi		%2, %2, 32	\n\t"
+
+       "addic.		%1, %1, -4	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+       // tail: last 4 rows, using the a0..a3 values already preloaded; no further loads from a
+       "lxvp		36, 0(%2)	\n\t"	// y0, y1
+
+       "xvmaddadp 	36, 40, 32	\n\t"
+       "xvmaddadp 	37, 41, 32	\n\t"
+
+       "xvmaddadp 	36, 42, 33	\n\t"
+       "xvmaddadp 	37, 43, 33	\n\t"
+
+       "xvmaddadp 	36, 44, 34	\n\t"
+       "xvmaddadp 	37, 45, 34	\n\t"
+
+       "xvmaddadp 	36, 46, 35	\n\t"
+       "xvmaddadp 	37, 47, 35	\n\t"
+
+       "stxvp		36, 0(%2)	\n\t"	// y0, y1
+
+     "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
+     "#a0=%3 a1=%4 a2=%5 a3=%6"
+     :
+       "+m" (*y),
+       "+r" (n),	// 1
+       "+b" (y),	// 2
+       "=b" (a0),	// 3
+       "=b" (a1),	// 4
+       "=&b" (a2),	// 5
+       "=&b" (a3)	// 6
+     :
+       "m" (*x),
+       "m" (*ap),
+       "d" (alpha),	// 9
+       "b" (x),		// 10: base of the D-form lxvp above; "b" keeps it out of r0, which a D-form base reads as 0
+       "b" (16),	// 11
+       "3" (ap),	// 12
+       "4" (lda)	// 13
+     :
+       "cr0", "memory",	// "memory": a[] and y[] are accessed far beyond the single elements named by the "m" operands
+       "vs32","vs33","vs34","vs35","vs36","vs37",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+     );
+}
diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c
new file mode 100644
index 000000000..ad5f1ba0d
--- /dev/null
+++ b/kernel/power/dgemv_n_power10.c
@@ -0,0 +1,565 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char  vec_t;	/* raw 16-byte vector, the operand type passed to the MMA builtin below */
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));	/* 16-byte vector of FLOAT */
+typedef __vector_pair          __attribute__((aligned(8))) vecp_t;	/* vector pair with alignment lowered to 8 bytes */
+/* asm micro-kernel dgemv_kernel_4x4; it defines HAVE_KERNEL_4x4, which disables the C fallback further down */
+#include "dgemv_n_microk_power10.c"
+/* MMA: ACC += outer product of the vector pair at &APTR and the 16-byte vector at &X; uses the caller's locals rX and rowA. NOTE(review): rX[0] reads 16 bytes at &X, presumably X plus the following element - confirm that is always readable. */
+#define MMA(X, APTR, ACC) \
+        rX = (vec_t *) & X; \
+        rowA = *((vecp_t*)((void*)&APTR)); \
+        __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
+/* SAVE: unpack ACC into result[0..3], gather element 0 of each row into result[0] and result[2], and add them scaled by valpha to the two 16-byte vectors at &y[Z]; uses the caller's locals rowC, result, valpha and y. */
+#define SAVE(ACC, Z) \
+        rowC = (v4sf_t *) &y[Z]; \
+        __builtin_mma_disassemble_acc ((void *)result, ACC); \
+        result[0][1] = result[1][0]; \
+        result[2][1] = result[3][0]; \
+        rowC[0] += valpha * result[0]; \
+        rowC[1] += valpha * result[2];
+/* y[0..n) += alpha * A * x for a 128-column panel: A starts at a_ptr with column stride lda, x is xo[0..127]. Rows go 32 at a time (8 accumulators), the rest 4 at a time. File-local helper, hence static like the other kernels here. */
+static void
+dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
+                    FLOAT * y, FLOAT alpha)
+{
+  BLASLONG i, j, tmp;
+  FLOAT *a0 = a_ptr;
+  FLOAT *x1 = xo;	/* start of x, restored for every group of rows */
+  vector double valpha = { alpha, alpha };
+  v4sf_t *rowC;
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  vecp_t rowA;
+  vec_t *rX;
+  tmp = (n / 32) * 32;	/* number of rows covered by the 32-row main loop */
+  for (i = 0; i < tmp; i += 32)
+    {
+      xo = x1;
+      a0 = a_ptr;
+      __builtin_mma_xxsetaccz (&acc0);
+      __builtin_mma_xxsetaccz (&acc1);
+      __builtin_mma_xxsetaccz (&acc2);
+      __builtin_mma_xxsetaccz (&acc3);
+      __builtin_mma_xxsetaccz (&acc4);
+      __builtin_mma_xxsetaccz (&acc5);
+      __builtin_mma_xxsetaccz (&acc6);
+      __builtin_mma_xxsetaccz (&acc7);
+      for (j = 0; j < 32; j++)	/* columns 0..31; the three loops below repeat this for 32..127 */
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);	/* NOTE(review): hint only; the address looks like it was meant to track column j (j * lda) - confirm */
+          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
+          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
+          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
+          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
+          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
+          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
+          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
+          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
+        }
+      xo += 32;
+      a0 += lda << 5;	/* advance 32 columns */
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
+          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
+          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
+          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
+          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
+          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
+          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
+          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
+          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
+          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
+          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
+          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
+          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
+          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
+          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
+          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
+          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
+          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
+          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
+          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
+          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
+          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      SAVE (&acc0, i + 0);
+      SAVE (&acc1, i + 4);
+      SAVE (&acc2, i + 8);
+      SAVE (&acc3, i + 12);
+      SAVE (&acc4, i + 16);
+      SAVE (&acc5, i + 20);
+      SAVE (&acc6, i + 24);
+      SAVE (&acc7, i + 28);
+
+    }
+  for (i = tmp; i < n; i += 4)	/* remaining rows, 4 per pass with one accumulator; n is presumably a multiple of 4 - confirm against callers */
+    {
+      xo = x1;
+      a0 = a_ptr;
+      __builtin_mma_xxsetaccz (&acc0);
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + j * lda], &acc0);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + j * lda], &acc0);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + j * lda], &acc0);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      for (j = 0; j < 32; j++)
+        {
+          __builtin_prefetch (xo+j);
+          __builtin_prefetch (a0+i+j+lda);
+          MMA (xo[j], a0[i + j * lda], &acc0);
+        }
+      xo += 32;
+      a0 += lda << 5;
+      SAVE (&acc0, i);
+    }
+}
+
+
+#define NBMAX 4096
+/* NBMAX is the number of rows CNAME hands to the kernels per pass (fewer for the final partial pass). */
+#ifndef HAVE_KERNEL_4x4
+/* Portable C fallback, compiled only when no optimised kernel defined HAVE_KERNEL_4x4: y[0..n) += alpha * (four columns of A starting at a_ptr, stride lda) * xo[0..3]. */
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha)
+{
+	BLASLONG i;
+	FLOAT x[4]  __attribute__ ((aligned (16)));
+	FLOAT *a0 = a_ptr;
+	FLOAT *a1 = a0 + lda;
+	FLOAT *a2 = a1 + lda;
+	FLOAT *a3 = a2 + lda;
+
+	/* scale the four x values by alpha once, outside the row loop */
+	for ( i=0; i<4; i++)
+		x[i] = xo[i] * alpha;
+	/* four rows per iteration; n is presumably a multiple of 4 - confirm against callers */
+	for ( i=0; i< n; i+=4 )
+	{
+		y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
+		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+	}
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+/* Two-column kernel: y[0..n) += alpha * (xo[0]*a0 + xo[1]*a1), where a0 and a1 point at the two columns. */
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
+{
+	BLASLONG i;
+	FLOAT x[4]  __attribute__ ((aligned (16)));
+	/* only x[0] and x[1] are used: the two x values pre-scaled by alpha */
+	for ( i=0; i<2; i++)
+		x[i] = xo[i] * alpha;
+	/* four rows per iteration; n is presumably a multiple of 4 - confirm against callers */
+	for ( i=0; i< n; i+=4 )
+	{
+		y[i] += a0[i]*x[0] + a1[i]*x[1];
+		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1];
+		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1];
+		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1];
+	}
+}
+
+
+#endif
+
+#ifndef HAVE_KERNEL_4x1
+/* Single-column kernel: y[0..n) += alpha * xo[0] * a0, where a0 points at the column. */
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
+{
+	BLASLONG i;
+	FLOAT x[4]  __attribute__ ((aligned (16)));
+	/* only x[0] is used: the x value pre-scaled by alpha */
+	for ( i=0; i<1; i++)
+		x[i] = xo[i] * alpha;
+	/* four rows per iteration; n is presumably a multiple of 4 - confirm against callers */
+	for ( i=0; i< n; i+=4 )
+	{
+		y[i] += a0[i]*x[0];
+		y[i+1] += a0[i+1]*x[0];
+		y[i+2] += a0[i+2]*x[0];
+		y[i+3] += a0[i+3]*x[0];
+	}
+}
+
+
+#endif
+
+/* Adds the n contiguous values at src onto dest, where consecutive dest elements are inc_dest apart. */
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+	BLASLONG i;
+	if ( inc_dest != 1 )
+	{
+		for ( i=0; i<n; i++ )
+		{
+			*dest += *src;
+			src++;
+			dest += inc_dest;	/* step to the next strided destination element */
+		}
+		return;
+	}
+	/* inc_dest == 1: nothing is done here; the only caller in this file invokes add_y when inc_y != 1 */
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	/* y += alpha * A * x; A is m x n with element (r,c) at a[r + c*lda]; x and y have strides inc_x and inc_y; buffer is scratch for y when inc_y != 1; dummy1 is unused; always returns 0. */
+	BLASLONG i;
+	FLOAT *a_ptr;
+	FLOAT *x_ptr;
+	FLOAT *y_ptr;
+	BLASLONG n1;
+	BLASLONG m1;
+	BLASLONG m2;
+	BLASLONG m3;
+	BLASLONG n2;
+	BLASLONG lda4 =  lda << 2;
+	BLASLONG lda128 = lda << 7;
+
+	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
+	FLOAT *ybuffer;
+
+        if ( m < 1 ) return(0);
+        if ( n < 1 ) return(0);
+
+	ybuffer = buffer;
+	BLASLONG n128 = n >> 7;			/* full 128-column panels */
+	n1 = (n - (n128 * 128)) >> 2;		/* 4-column groups among the remaining columns */
+	n2 = (n - (n128 * 128)) & 3;		/* last 0..3 columns */
+
+        m3 = m & 3  ;				/* last 0..3 rows, handled by the scalar code at the end */
+        m1 = m & -4 ;				/* rows handled by the kernels */
+        m2 = (m & (NBMAX-1)) - m3 ;		/* size of the final, partial row block */
+
+	y_ptr = y;
+
+	BLASLONG NB = NBMAX;
+	/* process the rows in blocks of NBMAX; the final partial block (m2 rows) ends the loop because NB != NBMAX afterwards */
+	while ( NB == NBMAX )
+	{
+
+		m1 -= NB;
+		if ( m1 < 0)
+		{
+			if ( m2 == 0 ) break;
+			NB = m2;
+		}
+
+		a_ptr = a;
+		x_ptr = x;
+
+		if ( inc_y != 1 )
+			memset(ybuffer,0,NB*sizeof(FLOAT));	/* kernels accumulate into the zeroed scratch; add_y scatters it below */
+		else
+			ybuffer = y_ptr;	/* contiguous y: kernels accumulate in place */
+
+		if ( inc_x == 1 )
+		{
+
+			for( i = 0; i < n128 ; i++)
+			{
+				dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+				a_ptr += lda128;
+				x_ptr += 128;
+			}
+
+			for( i = 0; i < n1 ; i++)
+			{
+				dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+				a_ptr += lda4;
+				x_ptr += 4;
+			}
+
+			if ( n2 & 2 )
+			{
+				dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
+				a_ptr += lda*2;
+				x_ptr += 2;
+			}
+
+
+			if ( n2 & 1 )
+			{
+				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
+				a_ptr += lda;
+				x_ptr += 1;
+
+			}
+
+
+		}
+		else
+		{
+			for( i = 0; i < n128 ; i++)
+			{
+	                        FLOAT xbuf128[128] __attribute__ ((aligned (16)));	/* contiguous copy of 128 strided x values */
+				BLASLONG j;
+				for ( j = 0; j < 128 ; j++)
+				{
+					xbuf128[j] = x_ptr[0];
+				        x_ptr += inc_x;
+				}
+				dgemv_kernel_4x128(NB,a_ptr,lda,xbuf128,ybuffer,alpha);
+				a_ptr += lda128;
+			}
+
+			for( i = 0; i < n1 ; i++)
+			{
+				xbuffer[0] = x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[1] =  x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[2] =  x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[3] = x_ptr[0];
+				x_ptr += inc_x;
+				dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
+				a_ptr += lda4;
+			}
+
+			for( i = 0; i < n2 ; i++)
+			{
+				xbuffer[0] = x_ptr[0];
+				x_ptr += inc_x;
+				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
+				a_ptr += lda;
+
+			}
+
+		}
+
+		a     += NB;	/* next block of rows */
+		if ( inc_y != 1 )
+		{
+			add_y(NB,ybuffer,y_ptr,inc_y);
+			y_ptr += NB * inc_y;
+		}
+		else
+			y_ptr += NB ;
+
+	}
+
+	if ( m3 == 0 ) return(0);
+	/* scalar handling of the last m3 (1..3) rows; a and y_ptr now point just past the rows done by the kernels */
+	if ( m3 == 3 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp0 = 0.0;
+		FLOAT temp1 = 0.0;
+		FLOAT temp2 = 0.0;
+		if ( lda == 3 && inc_x ==1 )
+		{
+
+			for( i = 0; i < ( n & -4 ); i+=4 )
+			{
+
+				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+
+				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
+				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
+				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
+
+				a_ptr += 12;
+				x_ptr += 4;
+			}
+
+			for( ; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				temp2 += a_ptr[2] * x_ptr[0];
+				a_ptr += 3;
+				x_ptr ++;
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				temp2 += a_ptr[2] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+
+
+			}
+
+		}
+		y_ptr[0] += alpha * temp0;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp1;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp2;
+		return(0);
+	}
+
+
+	if ( m3 == 2 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp0 = 0.0;
+		FLOAT temp1 = 0.0;
+		if ( lda == 2 && inc_x ==1 )
+		{
+
+			for( i = 0; i < (n & -4) ; i+=4 )
+			{
+				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
+				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
+				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+				a_ptr += 8;
+				x_ptr += 4;
+
+			}
+
+
+			for( ; i < n; i++ )
+			{
+				temp0 += a_ptr[0]   * x_ptr[0];
+				temp1 += a_ptr[1]   * x_ptr[0];
+				a_ptr += 2;
+				x_ptr ++;
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+
+
+			}
+
+		}
+		y_ptr[0] += alpha * temp0;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp1;
+		return(0);
+	}
+
+	if ( m3 == 1 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp = 0.0;
+		if ( lda == 1 && inc_x ==1 )
+		{
+
+			for( i = 0; i < (n & -4); i+=4 )
+			{
+				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
+
+			}
+
+			for( ; i < n; i++ )
+			{
+				temp += a_ptr[i] * x_ptr[i];
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp += a_ptr[0] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+			}
+
+		}
+		y_ptr[0] += alpha * temp;
+		return(0);
+	}
+
+
+	return(0);
+}
+
+
diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c
new file mode 100644
index 000000000..3db4d5785
--- /dev/null
+++ b/kernel/power/dgemv_t_power10.c
@@ -0,0 +1,840 @@
+/***************************************************************************
+Copyright (c) 2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "common.h"
+
+#define NBMAX 1024
+//#define PREFETCH 1
+#include <altivec.h> 
+
+#define HAVE_KERNEL4x8_ASM 1
+
+
+#if defined(HAVE_KERNEL4x8_ASM)
+static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
+    // y[0..7] += alpha * (dot products of x with 8 columns of ap that lie lda doubles apart).
+    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
+    BLASLONG off2;
+    BLASLONG tempR;
+    __asm__(
+            // Data is consumed 4 doubles per step, so n is expected to be a positive multiple of 4.
+            "sldi   %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
+            "sldi   %[off], %[off], 3 \n\t" // lda * sizeof (double)
+            "xxlxor 34,34,34  \n\t"
+            "xxlxor 35,34,34  \n\t"
+            "add    %[a2], %[a0], %[temp]    \n\t"
+            "add    %[a1], %[a0], %[off]     \n\t"
+            "xxlxor 4,34,34 \n\t"
+            "xxlxor 5,34,34 \n\t"
+            "xxlxor 6,34,34 \n\t"
+            "xxlxor 7,34,34 \n\t"
+            "add    %[a3], %[a2], %[off]     \n\t"
+            "add    %[a4], %[a2], %[temp]    \n\t"
+
+            "xxlxor 8,34,34 \n\t"
+            "xxlxor 9,34,34 \n\t"
+            "add    %[a5], %[a3], %[temp]    \n\t"
+            "li     %[off],0    \n\t"
+            "li     %[off2],16  \n\t"
+            // NOTE: off/off2 are only incremented below; no load in this kernel addresses through them.
+            "add    %[a6], %[a4], %[temp]    \n\t"
+            "add    %[a7], %[a5], %[temp]    \n\t"
+
+
+            // Preload the first 4 doubles of x and of each of the 8 columns.
+
+            "lxvp 32, 0(%[x])   \n\t"
+            "lxvp 36, 0(%[a0])  \n\t"
+            "lxvp 38, 0(%[a1])  \n\t"
+            "lxvp 40, 0(%[a2])  \n\t"
+            "lxvp 42, 0(%[a3])  \n\t"
+            "lxvp 44, 0(%[a4])  \n\t"
+            "lxvp 46, 0(%[a5])  \n\t"
+            "lxvp 48, 0(%[a6])  \n\t"
+            "lxvp 50, 0(%[a7])  \n\t"
+#if defined(PREFETCH)
+            "li     %[temp],896  \n\t"
+#endif
+            "addic. %[n],%[n],-4 \n\t"
+
+            "li       %[off],32 \n\t"
+
+
+            "ble-     two%=      \n\t"
+
+            //--------------------------------------------------
+            ".align   5           \n\t"
+            "one%=:                     \n\t"
+            "xvmaddadp   34,36,32  \n\t"
+            "xvmaddadp   35,38,32  \n\t"
+            "addi   %[off2],  %[off2],32 \n\t"
+            "xvmaddadp   4,40,32  \n\t"
+            "xvmaddadp   5,42,32  \n\t"
+            "xvmaddadp   6,44,32  \n\t"
+            "xvmaddadp   7,46,32  \n\t"
+            "xvmaddadp   8,48,32  \n\t"
+            "xvmaddadp   9,50,32  \n\t"
+
+            "xvmaddadp  34,37,33  \n\t"
+            "xvmaddadp  35,39,33  \n\t"
+            "lxvp 36, 32(%[a0])  \n\t"
+            "lxvp 38, 32(%[a1])  \n\t"
+            "xvmaddadp  4,41,33  \n\t"
+            "xvmaddadp  5,43,33  \n\t"
+            "addi       %[off],  %[off],32 \n\t"
+            "lxvp 40, 32(%[a2])  \n\t"
+            "lxvp 42, 32(%[a3])  \n\t"
+            "xvmaddadp  6,45,33  \n\t"
+            "xvmaddadp  7,47,33  \n\t"
+            "lxvp 44, 32(%[a4])  \n\t"
+            "lxvp 46, 32(%[a5])  \n\t"
+            "xvmaddadp  8,49,33  \n\t"
+            "xvmaddadp  9,51,33  \n\t"
+
+            "addic. %[n],%[n],-4 \n\t"
+            "lxvp 48, 32(%[a6])  \n\t"
+            "lxvp 50, 32(%[a7])  \n\t"
+            "lxvp 32, 32(%[x])   \n\t"
+            "ble- two%=  \n\t"
+            "xvmaddadp   34,36,32  \n\t"
+            "xvmaddadp   35,38,32  \n\t"
+            "addi   %[off2],  %[off2],32 \n\t"
+            "xvmaddadp   4,40,32  \n\t"
+            "xvmaddadp   5,42,32  \n\t"
+            "xvmaddadp   6,44,32  \n\t"
+            "xvmaddadp   7,46,32  \n\t"
+            "xvmaddadp   8,48,32  \n\t"
+            "xvmaddadp   9,50,32  \n\t"
+
+            "xvmaddadp  34,37,33  \n\t"
+            "xvmaddadp  35,39,33  \n\t"
+            "lxvp 36, 64(%[a0])  \n\t"
+            "lxvp 38, 64(%[a1])  \n\t"
+            "xvmaddadp  4,41,33  \n\t"
+            "xvmaddadp  5,43,33  \n\t"
+            "addi       %[off],  %[off],32 \n\t"
+            "lxvp 40, 64(%[a2])  \n\t"
+            "lxvp 42, 64(%[a3])  \n\t"
+            "xvmaddadp  6,45,33  \n\t"
+            "xvmaddadp  7,47,33  \n\t"
+            "lxvp 44, 64(%[a4])  \n\t"
+            "lxvp 46, 64(%[a5])  \n\t"
+            "xvmaddadp  8,49,33  \n\t"
+            "xvmaddadp  9,51,33  \n\t"
+
+            "addic. %[n],%[n],-4 \n\t"
+            "lxvp 48, 64(%[a6])  \n\t"
+            "lxvp 50, 64(%[a7])  \n\t"
+            "lxvp 32, 64(%[x])   \n\t"
+            "ble- two%=  \n\t"
+            "xvmaddadp   34,36,32  \n\t"
+            "xvmaddadp   35,38,32  \n\t"
+#if defined(PREFETCH)
+            "addi    %[temp],%[temp],128 \n\t"
+#endif
+            "addi   %[off2],  %[off2],32 \n\t"
+            "xvmaddadp   4,40,32  \n\t"
+            "xvmaddadp   5,42,32  \n\t"
+            "xvmaddadp   6,44,32  \n\t"
+            "xvmaddadp   7,46,32  \n\t"
+            "xvmaddadp   8,48,32  \n\t"
+            "xvmaddadp   9,50,32  \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a0]  \n\t"
+#endif
+
+            "xvmaddadp  34,37,33  \n\t"
+            "xvmaddadp  35,39,33  \n\t"
+            "lxvp 36, 96(%[a0])  \n\t"
+            "lxvp 38, 96(%[a1])  \n\t"
+            "xvmaddadp  4,41,33  \n\t"
+            "xvmaddadp  5,43,33  \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a1]  \n\t"
+#endif
+            "lxvp 40, 96(%[a2])  \n\t"
+            "lxvp 42, 96(%[a3])  \n\t"
+            "addi       %[off],  %[off],32 \n\t"
+            "xvmaddadp  6,45,33  \n\t"
+            "xvmaddadp  7,47,33  \n\t"
+            "lxvp 44, 96(%[a4])  \n\t"
+            "lxvp 46, 96(%[a5])  \n\t"
+            "xvmaddadp  8,49,33  \n\t"
+            "xvmaddadp  9,51,33  \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a3]  \n\t"
+#endif
+            "lxvp 48, 96(%[a6])  \n\t"
+            "lxvp 50, 96(%[a7])  \n\t"
+            "lxvp 32, 96(%[x])   \n\t"
+
+            "addic. %[n],%[n],-4 \n\t"
+            "ble- two%=  \n\t"
+
+            "addi   %[off2],  %[off2],32 \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a2]  \n\t"
+#endif
+            "xvmaddadp   34,36,32  \n\t"
+            "xvmaddadp   35,38,32  \n\t"
+            "xvmaddadp   4,40,32  \n\t"
+            "xvmaddadp   5,42,32  \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a4]  \n\t"
+#endif
+            "xvmaddadp   6,44,32  \n\t"
+            "xvmaddadp   7,46,32  \n\t"
+            "xvmaddadp   8,48,32  \n\t"
+            "xvmaddadp   9,50,32  \n\t"
+
+#if defined(PREFETCH)
+          "dcbt   %[temp],%[a5]  \n\t"
+#endif
+            "xvmaddadp  34,37,33  \n\t"
+            "xvmaddadp  35,39,33  \n\t"
+            "lxvp 36, 128(%[a0])  \n\t"
+            "lxvp 38, 128(%[a1])  \n\t"
+            "xvmaddadp  4,41,33  \n\t"
+            "xvmaddadp  5,43,33  \n\t"
+            "addi       %[off],  %[off],32 \n\t"
+            "lxvp 40, 128(%[a2])  \n\t"
+            "lxvp 42, 128(%[a3])  \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a6]  \n\t"
+#endif
+            "xvmaddadp  6,45,33  \n\t"
+            "xvmaddadp  7,47,33  \n\t"
+            "lxvp 44, 128(%[a4])  \n\t"
+            "lxvp 46, 128(%[a5])  \n\t"
+            "xvmaddadp  8,49,33  \n\t"
+            "xvmaddadp  9,51,33  \n\t"
+
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[a7]  \n\t"
+#endif
+            "addic. %[n],%[n],-4 \n\t"
+            "lxvp 48, 128(%[a6])  \n\t"
+            "lxvp 50, 128(%[a7])  \n\t"
+            "lxvp 32, 128(%[x])   \n\t"
+#if defined(PREFETCH)
+            "dcbt   %[temp],%[x]  \n\t"
+#endif
+            "addi    %[a0], %[a0], 128     \n\t"
+            "addi    %[a1], %[a1], 128     \n\t"
+            "addi    %[a2], %[a2], 128     \n\t"
+            "addi    %[a3], %[a3], 128     \n\t"
+            "addi    %[a4], %[a4], 128     \n\t"
+            "addi    %[a5], %[a5], 128    \n\t"
+            "addi    %[a6], %[a6], 128     \n\t"
+            "addi    %[a7], %[a7], 128     \n\t"
+            "addi    %[x], %[x], 128     \n\t"
+            "bgt+ one%=  \n\t"
+            ".align   5           \n\t"
+            "two%=: \n\t"
+            //--------------------------------------------
+            // Drain: fold the last preloaded 4 doubles into the accumulators, then y += alpha * lane sums.
+            "xvmaddadp   34,36,32  \n\t"
+            "xvmaddadp   35,38,32  \n\t"
+            "xvmaddadp   4,40,32  \n\t"
+            "xvmaddadp   5,42,32  \n\t"
+            "xvmaddadp   6,44,32  \n\t"
+            "xvmaddadp   7,46,32  \n\t"
+            "xvmaddadp   8,48,32  \n\t"
+            "xvmaddadp   9,50,32  \n\t"
+            XXSPLTD_S(36,%x[alpha],0)
+            "xvmaddadp  34,37,33  \n\t"
+            "xvmaddadp  35,39,33  \n\t"
+            "xvmaddadp  4,41,33  \n\t"
+            "xvmaddadp  5,43,33  \n\t"
+            "xvmaddadp  6,45,33  \n\t"
+            "xvmaddadp  7,47,33  \n\t"
+            "xvmaddadp  8,49,33  \n\t"
+            "xvmaddadp  9,51,33  \n\t"
+
+            "lxvp 38, 0(%[y]) \n\t"
+            "lxvp 40, 32(%[y]) \n\t"
+
+
+
+            XXMRGLD_S(42,35,34)
+            XXMRGHD_S(43,35,34)
+
+            XXMRGLD_S(44,5,4)
+            XXMRGHD_S(45,5,4)
+
+            "xvadddp 42,42,43 \n\t"
+
+            XXMRGLD_S(46,7,6)
+            XXMRGHD_S(47,7,6)
+
+            "xvadddp 44,44,45 \n\t"
+
+            XXMRGLD_S(48,9,8)
+            XXMRGHD_S(49,9,8)
+
+            "xvadddp 46,46,47 \n\t"
+
+            "xvmaddadp  39,42,36  \n\t"
+            "xvmaddadp  38,44,36  \n\t"
+
+            "xvadddp 48,48,49 \n\t"
+
+            "xvmaddadp  41,46,36  \n\t"
+
+            "stxvp 38, 0(%[y]) \n\t"
+            "xvmaddadp  40,48,36  \n\t"
+            "stxvp 40, 32(%[y])  \n\t"
+
+            : [memy] "+m" (*(double (*)[8])y),
+            [n] "+&r" (n),
+            [a0] "=b" (a0),
+            [a1] "=&b" (a1),
+            [a2] "=&b" (a2),
+            [a3] "=&b" (a3),
+            [a4] "=&b" (a4),
+            [a5] "=&b" (a5),
+            [a6] "=&b" (a6),
+            [a7] "=&b" (a7),
+            [off] "+&b" (lda),
+            [off2]"=&b" (off2),
+            [temp] "=&b" (tempR),
+            [x] "+&b" (x) // the loop advances %[x] (addi), so it must be read-write, not an input-only operand
+            : [memx] "m" (*(const double (*)[n])x),
+            [mem_ap] "m" (*(const double (*)[n*8]) ap),
+            [alpha] "d" (alpha),
+            "[a0]" (ap),
+            [y] "b" (y)
+            : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
+            "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+            );
+    return;
+}
+#else
+static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { // C variant, built only when HAVE_KERNEL4x8_ASM is undefined
+    BLASLONG i;
+#if defined(PREFETCH)  
+    BLASLONG j, c, k;
+#endif    
+    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
+    __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
+    register __vector double temp0 = {0, 0};
+    register __vector double temp1 = {0, 0};
+    register __vector double temp2 = {0, 0};
+    register __vector double temp3 = {0, 0};
+    register __vector double temp4 = {0, 0};
+    register __vector double temp5 = {0, 0};
+    register __vector double temp6 = {0, 0};
+    register __vector double temp7 = {0, 0};
+    // a0..a7 address eight consecutive columns, each starting lda elements after the previous one.
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    a4 = a3 + lda;
+    a5 = a4 + lda;
+    a6 = a5 + lda;
+    a7 = a6 + lda;
+    va0 = (__vector double*) a0;
+    va1 = (__vector double*) a1;
+    va2 = (__vector double*) a2;
+    va3 = (__vector double*) a3;
+    va4 = (__vector double*) a4;
+    va5 = (__vector double*) a5;
+    va6 = (__vector double*) a6;
+    va7 = (__vector double*) a7;
+    v_x = (__vector double*) x;
+    // From here on x and the columns are indexed as 2-double vectors, so n doubles are n/2 vectors.
+#if defined(PREFETCH)
+    // PREFETCH variant: walk the n/2 vectors in blocks of at most 64, prefetching 64 vectors ahead.
+    c = n >> 1;
+
+    for (j = 0; j < c; j += 64) {
+        k = (c - j) > 64 ? 64 : (c - j);
+        __builtin_prefetch(v_x + 64);
+        __builtin_prefetch(va0 + 64);
+        __builtin_prefetch(va1 + 64);
+        __builtin_prefetch(va2 + 64);
+        __builtin_prefetch(va3 + 64);
+        __builtin_prefetch(va4 + 64);
+        __builtin_prefetch(va5 + 64);
+        __builtin_prefetch(va6 + 64);
+        __builtin_prefetch(va7 + 64); 
+         for (i = 0; i < k; i += 2) {
+#else
+        // Each iteration consumes two vectors (4 doubles) of x and of every column.
+        for (i = 0; i < n/2; i += 2) {
+#endif
+            temp0 += v_x[i] * va0[i];
+            temp1 += v_x[i] * va1[i];
+            temp2 += v_x[i] * va2[i];
+            temp3 += v_x[i] * va3[i];
+            temp4 += v_x[i] * va4[i];
+            temp5 += v_x[i] * va5[i];
+            temp6 += v_x[i] * va6[i];
+            temp7 += v_x[i] * va7[i];
+            temp0 += v_x[i + 1] * va0[i + 1];
+            temp1 += v_x[i + 1] * va1[i + 1];
+            temp2 += v_x[i + 1] * va2[i + 1];
+            temp3 += v_x[i + 1] * va3[i + 1];
+
+            temp4 += v_x[i + 1] * va4[i + 1];
+            temp5 += v_x[i + 1] * va5[i + 1];
+            temp6 += v_x[i + 1] * va6[i + 1];
+            temp7 += v_x[i + 1] * va7[i + 1];
+        }
+#if defined(PREFETCH)
+        va0 += 64;
+        va1 += 64;
+        va2 += 64;
+        va3 += 64;
+        va4 += 64;
+        va5 += 64;
+        va6 += 64;
+        va7 += 64;
+        v_x += 64;
+
+    }
+#endif
+    y[0] += alpha * (temp0[0] + temp0[1]); // add both lanes of the accumulator, scale by alpha
+    y[1] += alpha * (temp1[0] + temp1[1]);
+    y[2] += alpha * (temp2[0] + temp2[1]);
+    y[3] += alpha * (temp3[0] + temp3[1]);
+
+    y[4] += alpha * (temp4[0] + temp4[1]);
+    y[5] += alpha * (temp5[0] + temp5[1]);
+    y[6] += alpha * (temp6[0] + temp6[1]);
+    y[7] += alpha * (temp7[0] + temp7[1]);
+
+}
+
+#endif
+ 
+
+static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    /*
+     * Four-column panel kernel: y[c] += alpha * dot(x, column c) for c = 0..3,
+     * where each column starts lda elements after the previous one. Data is
+     * walked as 2-double vectors, two vectors (4 doubles) per iteration.
+     */
+    const BLASLONG vec_count = n / 2;
+    BLASLONG k;
+    __vector double *col0 = (__vector double *) ap;
+    __vector double *col1 = (__vector double *) (ap + lda);
+    __vector double *col2 = (__vector double *) (ap + lda + lda);
+    __vector double *col3 = (__vector double *) (ap + lda + lda + lda);
+    __vector double *xv = (__vector double *) x;
+    /* Even-indexed vectors feed even0..even3, odd-indexed ones odd0..odd3. */
+    __vector double even0 = {0, 0}, even1 = {0, 0}, even2 = {0, 0}, even3 = {0, 0};
+    __vector double odd0 = {0, 0}, odd1 = {0, 0}, odd2 = {0, 0}, odd3 = {0, 0};
+
+    for (k = 0; k < vec_count; k += 2) {
+        const __vector double x_even = xv[k];
+        const __vector double x_odd = xv[k + 1];
+
+        even0 += x_even * col0[k];
+        even1 += x_even * col1[k];
+        even2 += x_even * col2[k];
+        even3 += x_even * col3[k];
+        odd0 += x_odd * col0[k + 1];
+        odd1 += x_odd * col1[k + 1];
+        odd2 += x_odd * col2[k + 1];
+        odd3 += x_odd * col3[k + 1];
+    }
+
+    /* Merge the two partial sums per column, then add both lanes. */
+    even0 += odd0;
+    even1 += odd1;
+    even2 += odd2;
+    even3 += odd3;
+
+    y[0] += alpha * (even0[0] + even0[1]);
+    y[1] += alpha * (even1[0] + even1[1]);
+    y[2] += alpha * (even2[0] + even2[1]);
+    y[3] += alpha * (even3[0] + even3[1]);
+}
+ 
+
+static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
+    // Two-column kernel: adds alpha * dot(x, column) to y[0] and y[inc_y];
+    // the second column starts lda elements after ap.
+    __vector double *col0 = (__vector double *) ap;
+    __vector double *col1 = (__vector double *) (ap + lda);
+    __vector double *xv = (__vector double *) x;
+    __vector double sum0 = {0, 0};
+    __vector double sum1 = {0, 0};
+    BLASLONG k = 0;
+
+    // Two 2-double vectors (4 doubles) per pass over the n / 2 vectors.
+    while (k < n / 2) {
+        sum0 += xv[k] * col0[k] + xv[k + 1] * col0[k + 1];
+        sum1 += xv[k] * col1[k] + xv[k + 1] * col1[k + 1];
+        k += 2;
+    }
+
+    // Add both lanes of each accumulator and scale by alpha.
+    y[0] += alpha * (sum0[0] + sum0[1]);
+    y[inc_y] += alpha * (sum1[0] + sum1[1]);
+}
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    // Single-column kernel: *y += alpha * dot(x, ap) over n elements.
+    __vector double *col = (__vector double *) ap;
+    __vector double *xv = (__vector double *) x;
+    __vector double sum = {0, 0};
+    BLASLONG k = 0;
+
+    // Two 2-double vectors (4 doubles) per pass over the n / 2 vectors.
+    while (k < n / 2) {
+        sum += xv[k] * col[k] + xv[k + 1] * col[k + 1];
+        k += 2;
+    }
+
+    *y += alpha * (sum[0] + sum[1]);
+}
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+    // Gather n elements spaced inc_src apart in src into the contiguous buffer dest.
+    BLASLONG k = 0;
+    for (; k < n; k++, src += inc_src) {
+        dest[k] = *src;
+    }
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { // y += alpha * A^T * x
+    BLASLONG i;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+    // a holds n columns of m elements, lda elements apart; x has m entries (stride inc_x), y has n (stride inc_y).
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2; 
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer;
+
+    if (m < 1) return (0);
+    if (n < 1) return (0);
+    // buffer is scratch space used to pack x contiguously when inc_x != 1.
+    xbuffer = buffer;
+    // Columns are processed in panels of 8 (n1 of them); n2 is the 0..7 leftover columns.
+    n1 = n >> 3;
+    n2 = n & 7;
+    // Rows: m3 = trailing 0..3 rows done by scalar code at the end; m2 = size of the last partial row block (multiple of 4).
+    m3 = m & 3;
+    m1 = m - m3;
+    m2 = (m & (NBMAX - 1)) - m3;
+
+    BLASLONG NB = NBMAX;
+    // Process rows in blocks of NB: NBMAX while full blocks remain, then one final block of m2 rows (if any).
+    while (NB == NBMAX) {
+
+        m1 -= NB;
+        if (m1 < 0) {
+            if (m2 == 0) break;
+            NB = m2;
+        }
+
+        y_ptr = y;
+        a_ptr = a;
+        x_ptr = x;
+        // The kernels read x contiguously: pack a strided x into the scratch buffer, otherwise use it in place.
+        if (inc_x != 1)
+            copy_x(NB, x_ptr, xbuffer, inc_x);
+        else
+            xbuffer = x_ptr;
+
+        BLASLONG lda8 = lda << 3;
+
+
+        if (inc_y == 1) {
+            // Contiguous y: the 8-column kernel accumulates straight into y.
+            for (i = 0; i < n1; i++) {
+                 
+                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
+ 
+                y_ptr += 8;
+                a_ptr += lda8;
+#if defined(PREFETCH)                
+               __builtin_prefetch(y_ptr+64);
+#endif               
+            }
+
+        } else {
+            // Strided y: accumulate into a zeroed local buffer, then scatter-add with stride inc_y.
+            for (i = 0; i < n1; i++) {
+                ybuffer[0] = 0;
+                ybuffer[1] = 0;
+                ybuffer[2] = 0;
+                ybuffer[3] = 0;
+                ybuffer[4] = 0;
+                ybuffer[5] = 0;
+                ybuffer[6] = 0;
+                ybuffer[7] = 0;
+                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+
+ 
+
+                *y_ptr += ybuffer[0];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[1];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[2];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[3];
+                y_ptr += inc_y;
+
+                *y_ptr += ybuffer[4];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[5];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[6];
+                y_ptr += inc_y;
+                *y_ptr += ybuffer[7];
+                y_ptr += inc_y;
+
+                a_ptr += lda8;
+            }
+
+        }
+
+        // Leftover columns of this row block: a panel of 4, then 2, then 1, as selected by the bits of n2.
+        if (n2 & 4) {
+            ybuffer[0] = 0;
+            ybuffer[1] = 0;
+            ybuffer[2] = 0;
+            ybuffer[3] = 0;
+            dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+
+            a_ptr += lda<<2;
+
+            *y_ptr += ybuffer[0];
+            y_ptr += inc_y;
+            *y_ptr += ybuffer[1];
+            y_ptr += inc_y;
+            *y_ptr += ybuffer[2];
+            y_ptr += inc_y;
+            *y_ptr += ybuffer[3];
+            y_ptr += inc_y;
+        }
+
+        if (n2 & 2) {
+            dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
+            a_ptr += lda << 1;
+            y_ptr += 2 * inc_y;
+
+        }
+
+        if (n2 & 1) {
+            dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
+            a_ptr += lda;
+            y_ptr += inc_y;
+
+        }
+        // Advance to the next block of NB rows.
+        a += NB;
+        x += NB * inc_x;
+
+
+    }
+
+    if (m3 == 0) return (0);
+    // Remaining m3 (1..3) rows: the x entries are pre-scaled by alpha and applied to every column with scalar code.
+    x_ptr = x;
+    a_ptr = a;
+    if (m3 == 3) {
+        FLOAT xtemp0 = *x_ptr * alpha;
+        x_ptr += inc_x;
+        FLOAT xtemp1 = *x_ptr * alpha;
+        x_ptr += inc_x;
+        FLOAT xtemp2 = *x_ptr * alpha;
+
+        FLOAT *aj = a_ptr;
+        y_ptr = y;
+
+        if (lda == 3 && inc_y == 1) {
+            // Columns are packed back to back (lda == 3): unrolled over 4 columns, then a remainder loop.
+            for (j = 0; j < (n & -4); j += 4) {
+
+                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
+                y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
+                y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
+                y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
+                aj += 12;
+            }
+
+            for (; j < n; j++) {
+                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
+                aj += 3;
+            }
+
+        } else {
+
+            if (inc_y == 1) {
+                // General lda with contiguous y: same 4-column unrolling using precomputed column offsets.
+                BLASLONG register lda2 = lda << 1;
+                BLASLONG register lda4 = lda << 2;
+                BLASLONG register lda3 = lda2 + lda;
+
+                for (j = 0; j < (n & -4); j += 4) {
+
+                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
+                    y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
+                    y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
+                    y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
+                    aj += lda4;
+                }
+
+                for (; j < n; j++) {
+
+                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
+                    aj += lda;
+                }
+
+            } else {
+                // Strided y: one column per iteration.
+                for (j = 0; j < n; j++) {
+                    *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
+                    y_ptr += inc_y;
+                    aj += lda;
+                }
+
+            }
+
+        }
+        return (0);
+    }
+
+    if (m3 == 2) {
+        FLOAT xtemp0 = *x_ptr * alpha;
+        x_ptr += inc_x;
+        FLOAT xtemp1 = *x_ptr * alpha;
+
+        FLOAT *aj = a_ptr;
+        y_ptr = y;
+
+        if (lda == 2 && inc_y == 1) {
+            // Columns are packed back to back (lda == 2): unrolled over 4 columns, then a remainder loop.
+            for (j = 0; j < (n & -4); j += 4) {
+                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
+                y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
+                y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
+                y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
+                aj += 8;
+
+            }
+
+            for (; j < n; j++) {
+                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
+                aj += 2;
+            }
+
+        } else {
+            if (inc_y == 1) {
+
+                BLASLONG register lda2 = lda << 1;
+                BLASLONG register lda4 = lda << 2;
+                BLASLONG register lda3 = lda2 + lda;
+
+                for (j = 0; j < (n & -4); j += 4) {
+
+                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
+                    y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
+                    y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
+                    y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
+                    aj += lda4;
+                }
+
+                for (; j < n; j++) {
+
+                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
+                    aj += lda;
+                }
+
+            } else {
+                for (j = 0; j < n; j++) {
+                    *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
+                    y_ptr += inc_y;
+                    aj += lda;
+                }
+            }
+
+        }
+        return (0);
+
+    }
+    // Only m3 == 1 reaches this point: a single remaining row.
+    FLOAT xtemp = *x_ptr * alpha;
+    FLOAT *aj = a_ptr;
+    y_ptr = y;
+    if (lda == 1 && inc_y == 1) {
+        for (j = 0; j < (n & -4); j += 4) {
+            y_ptr[j] += aj[j] * xtemp;
+            y_ptr[j + 1] += aj[j + 1] * xtemp;
+            y_ptr[j + 2] += aj[j + 2] * xtemp;
+            y_ptr[j + 3] += aj[j + 3] * xtemp;
+        }
+        for (; j < n; j++) {
+            y_ptr[j] += aj[j] * xtemp;
+        }
+
+
+    } else {
+        if (inc_y == 1) {
+
+            BLASLONG register lda2 = lda << 1;
+            BLASLONG register lda4 = lda << 2;
+            BLASLONG register lda3 = lda2 + lda;
+            for (j = 0; j < (n & -4); j += 4) {
+                y_ptr[j] += *aj * xtemp;
+                y_ptr[j + 1] += *(aj + lda) * xtemp;
+                y_ptr[j + 2] += *(aj + lda2) * xtemp;
+                y_ptr[j + 3] += *(aj + lda3) * xtemp;
+                aj += lda4;
+            }
+
+            for (; j < n; j++) {
+                y_ptr[j] += *aj * xtemp;
+                aj += lda;
+            }
+
+        } else {
+            for (j = 0; j < n; j++) {
+                *y_ptr += *aj * xtemp;
+                y_ptr += inc_y;
+                aj += lda;
+            }
+
+        }
+    }
+
+    return (0);
+
+}
+

From 104aa678b0f4bc4dd9f65959d0b6f1aeb7b6f6d3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 30 Jul 2020 11:40:52 +0200
Subject: [PATCH 3/8] Fix inadvertent version number reversal to 0.3.9.dev
 caused by #2710

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e51e7e38..4bef6570c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 9.dev)
+set(OpenBLAS_PATCH_VERSION 10.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From 589c74aed38bb7923d6653fa9370b81e4fe95b4a Mon Sep 17 00:00:00 2001
From: Kevin Adler <kadler@us.ibm.com>
Date: Thu, 30 Jul 2020 20:52:16 -0500
Subject: [PATCH 4/8] Use systemcfg APIs for CPU detection on AIX

AIX libc already provides ready access to an integer that contains a bit
identifying the CPU it's running on, so there's no need to call a
program and grep its output. Additionally, prtconf is not available in
the PASE runtime, which provides an AIX emulation layer on the IBM i
operating system.

The AIX systemcfg.h also provides macro definitions like POWER_8,
POWER_9, etc for all the bits defining the CPUs as well as macros like
__power_8(), __power_9_andup() that return booleans, but I did not use
them. Since these macros depend on the level of the OS in which it is
built, they may not be defined and instead the associated hex literals
are used directly.
---
 cpuid_power.c | 41 +++++++++++++----------------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

diff --git a/cpuid_power.c b/cpuid_power.c
index 8f578d68f..df3dc8668 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -38,6 +38,7 @@
 
 #include  <sys/utsname.h>
 #ifdef _AIX
+#include <sys/systemcfg.h>
 #include <sys/vminfo.h>
 #endif
 #ifdef __APPLE__
@@ -137,35 +138,19 @@ int detect(void){
 #endif
 
 #ifdef _AIX
-  FILE *infile;
-  char buffer[512], *p;
+  // Cast from int to unsigned to ensure comparisons work for all bits in
+  // the bit mask, even the top bit
+  unsigned implementation = (unsigned) _system_configuration.implementation;
 
-  p = (char *)NULL;
-  infile = popen("prtconf|grep 'Processor Type'", "r");
-  while (fgets(buffer, sizeof(buffer), infile)){
-    if (!strncmp("Pro", buffer, 3)){
-	p = strchr(buffer, ':') + 2;
-#if 0
-	fprintf(stderr, "%s\n", p);
-#endif
-	break;
-      }
-  }
-
-  pclose(infile);
-
-  if (strstr(p, "POWER3")) return CPUTYPE_POWER3;
-  if (strstr(p, "POWER4")) return CPUTYPE_POWER4;
-  if (strstr(p, "PPC970")) return CPUTYPE_PPC970;
-  if (strstr(p, "POWER5")) return CPUTYPE_POWER5;
-  if (strstr(p, "POWER6")) return CPUTYPE_POWER6;
-  if (strstr(p, "POWER7")) return CPUTYPE_POWER6;
-  if (strstr(p, "POWER8")) return CPUTYPE_POWER8;
-  if (strstr(p, "POWER9")) return CPUTYPE_POWER9;
-  if (strstr(p, "POWER10")) return CPUTYPE_POWER10;
-  if (strstr(p, "Cell")) return CPUTYPE_CELL;
-  if (strstr(p, "7447")) return CPUTYPE_PPCG4;
-  return CPUTYPE_POWER5;
+  if (implementation >= 0x40000u) return CPUTYPE_POWER10;
+  else if (implementation & 0x20000) return CPUTYPE_POWER9;
+  else if (implementation & 0x10000) return CPUTYPE_POWER8;
+  else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7
+  else if (implementation & 0x04000) return CPUTYPE_POWER6;
+  else if (implementation & 0x02000) return CPUTYPE_POWER5;
+  else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450
+  else if (implementation & 0x00800) return CPUTYPE_POWER4;
+  else return CPUTYPE_POWER3;
 #endif
 
 #ifdef __APPLE__

From da9e2a7adafc2e0d321e6f2f90beaffed2853372 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 31 Jul 2020 16:03:33 +0200
Subject: [PATCH 5/8] Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes

---
 Makefile         |  3 ++-
 Makefile.install | 12 ++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index e113026dd..c1d943fac 100644
--- a/Makefile
+++ b/Makefile
@@ -365,11 +365,12 @@ clean ::
 	@$(MAKE) -C kernel clean
 #endif
 	@$(MAKE) -C reference clean
-	@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
+	@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0
 ifeq ($(OSNAME), Darwin)
 	@rm -rf getarch.dSYM getarch_2nd.dSYM
 endif
 	@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
+	@rm -f cblas.tmp cblas.tmp2
 	@touch $(NETLIB_LAPACK_DIR)/make.inc
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
 	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
diff --git a/Makefile.install b/Makefile.install
index dad869f4c..12713a6db 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -45,7 +45,16 @@ install : 	lib.grd
 
 ifndef NO_CBLAS
 	@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
-	@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
+	@cp cblas.h cblas.tmp
+ifdef SYMBOLPREFIX
+	@sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2
+	@sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp
+endif
+ifdef SYMBOLSUFFIX
+	@sed 's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2
+	@sed 's/(void)/$(SYMBOLSUFFIX)(void)/g'  cblas.tmp2 > cblas.tmp
+endif
+	@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
 endif
 
 ifneq ($(OSNAME), AIX)
@@ -168,4 +177,3 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!
-

From 60cd5e55fc2b8d50b52ebc54c701cb7315ad74ca Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 1 Aug 2020 12:31:39 +0200
Subject: [PATCH 6/8] Protect against inadvertent activation of USE_CUDA

---
 driver/others/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/driver/others/Makefile b/driver/others/Makefile
index 5653f3c25..7558ec058 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -47,8 +47,10 @@ endif
 endif
 
 ifdef USE_CUDA
+ifeq ($(USE_CUDA), 1)
 COMMONOBJS	+= cuda_init.$(SUFFIX)
 endif
+endif
 
 ifdef FUNCTION_PROFILE
 COMMONOBJS	+= profile.$(SUFFIX)

From ecf4b9e0fca35ed15e3b0354002584fbd29a6166 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 1 Aug 2020 17:06:03 +0200
Subject: [PATCH 7/8] Improve substitution rules for SYMBOLPREFIX and -SUFFIX
 addition

---
 Makefile.install | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index 12713a6db..01c0b1226 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -47,12 +47,18 @@ ifndef NO_CBLAS
 	@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
 	@cp cblas.h cblas.tmp
 ifdef SYMBOLPREFIX
-	@sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2
-	@sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp
+	@sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
+	@sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
+	@# Undo the prefix on the openblas_complex_* names, which the previous substitution also hit
+	@sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g'  cblas.tmp > cblas.tmp2
+	@sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
 endif
 ifdef SYMBOLSUFFIX
-	@sed 's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2
-	@sed 's/(void)/$(SYMBOLSUFFIX)(void)/g'  cblas.tmp2 > cblas.tmp
+	@sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
+	@sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
+	@# Strip the suffix again from the openblas_complex_* names, which the previous substitution also hit
+	@sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g'  cblas.tmp > cblas.tmp2
+	@sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
 endif
 	@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
 endif

From 6794ac34153d9def9a1056738090160868417702 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 2 Aug 2020 11:20:08 +0200
Subject: [PATCH 8/8] Add SYMBOLPREFIX and/or -SUFFIX to cblas.h if needed

---
 CMakeLists.txt | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e51e7e38..c324e2241 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 9.dev)
+set(OpenBLAS_PATCH_VERSION 10.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
@@ -249,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   endif()
 endif()
 
-if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
+if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")  # quoted: an empty expansion must remain an operand of STREQUAL
 if (NOT DEFINED ARCH)
   set(ARCH_IN "x86_64")
 else()
@@ -358,10 +358,21 @@ endif()
 
 if(NOT NO_CBLAS)
 	message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
-
 	set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
 	file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
 	string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
+	if (NOT "${SYMBOLPREFIX}" STREQUAL "")  # quoted so an empty or unset prefix is still a valid STREQUAL operand
+	string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS	"${CBLAS_H_CONTENTS_NEW}")  # prefix every " cblas" occurrence
+	string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW	"${CBLAS_H_CONTENTS}")  # same for " openblas"
+	string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS	"${CBLAS_H_CONTENTS_NEW}")  # undo the prefix on openblas_complex* names
+	string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")  # result must end up in CBLAS_H_CONTENTS_NEW
+	endif()
+	if (NOT "${SYMBOLSUFFIX}" STREQUAL "")  # quoted for the same reason as the prefix check
+	string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS	"${CBLAS_H_CONTENTS_NEW}")  # append suffix to each cblas token, which ends at a space or "("
+	string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")  # same for openblas tokens
+	string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS	"${CBLAS_H_CONTENTS_NEW}")  # strip the suffix again from openblas_complex* names
+	string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW	"${CBLAS_H_CONTENTS}")  # result must end up in CBLAS_H_CONTENTS_NEW
+	endif()
 	file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
 	install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()