From 96dd0ef4f71da18324864e979133e873aa66306a Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 14:25:10 +1000
Subject: [PATCH 1/9] Optimise ccopy for POWER9

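Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.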
---
 kernel/power/ccopy_microk_power8.c | 128 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index b2b1bead1..613c4d286 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
 {
   __asm__
     (
-       "lxvw4x		32, 0, %2	\n\t"
-       "lxvw4x		33, %5, %2	\n\t"
-       "lxvw4x		34, %6, %2	\n\t"
-       "lxvw4x		35, %7, %2	\n\t"
-       "lxvw4x		36, %8, %2	\n\t"
-       "lxvw4x		37, %9, %2	\n\t"
-       "lxvw4x		38, %10, %2	\n\t"
-       "lxvw4x		39, %11, %2	\n\t"
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %5, %2	\n\t"
+       "lxvd2x		34, %6, %2	\n\t"
+       "lxvd2x		35, %7, %2	\n\t"
+       "lxvd2x		36, %8, %2	\n\t"
+       "lxvd2x		37, %9, %2	\n\t"
+       "lxvd2x		38, %10, %2	\n\t"
+       "lxvd2x		39, %11, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %5, %2	\n\t"
-       "lxvw4x		42, %6, %2	\n\t"
-       "lxvw4x		43, %7, %2	\n\t"
-       "lxvw4x		44, %8, %2	\n\t"
-       "lxvw4x		45, %9, %2	\n\t"
-       "lxvw4x		46, %10, %2	\n\t"
-       "lxvw4x		47, %11, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
@@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
        ".p2align	5		\n"
      "1:				\n\t"
 
-       "stxvw4x		32, 0, %3	\n\t"
-       "stxvw4x		33, %5, %3	\n\t"
-       "lxvw4x		32, 0, %2	\n\t"
-       "lxvw4x		33, %5, %2	\n\t"
-       "stxvw4x		34, %6, %3	\n\t"
-       "stxvw4x		35, %7, %3	\n\t"
-       "lxvw4x		34, %6, %2	\n\t"
-       "lxvw4x		35, %7, %2	\n\t"
-       "stxvw4x		36, %8, %3	\n\t"
-       "stxvw4x		37, %9, %3	\n\t"
-       "lxvw4x		36, %8, %2	\n\t"
-       "lxvw4x		37, %9, %2	\n\t"
-       "stxvw4x		38, %10, %3	\n\t"
-       "stxvw4x		39, %11, %3	\n\t"
-       "lxvw4x		38, %10, %2	\n\t"
-       "lxvw4x		39, %11, %2	\n\t"
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %5, %2	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "lxvd2x		34, %6, %2	\n\t"
+       "lxvd2x		35, %7, %2	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "lxvd2x		36, %8, %2	\n\t"
+       "lxvd2x		37, %9, %2	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
+       "lxvd2x		38, %10, %2	\n\t"
+       "lxvd2x		39, %11, %2	\n\t"
 
        "addi		%3, %3, 128	\n\t"
        "addi		%2, %2, 128	\n\t"
 
-       "stxvw4x		40, 0, %3	\n\t"
-       "stxvw4x		41, %5, %3	\n\t"
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %5, %2	\n\t"
-       "stxvw4x		42, %6, %3	\n\t"
-       "stxvw4x		43, %7, %3	\n\t"
-       "lxvw4x		42, %6, %2	\n\t"
-       "lxvw4x		43, %7, %2	\n\t"
-       "stxvw4x		44, %8, %3	\n\t"
-       "stxvw4x		45, %9, %3	\n\t"
-       "lxvw4x		44, %8, %2	\n\t"
-       "lxvw4x		45, %9, %2	\n\t"
-       "stxvw4x		46, %10, %3	\n\t"
-       "stxvw4x		47, %11, %3	\n\t"
-       "lxvw4x		46, %10, %2	\n\t"
-       "lxvw4x		47, %11, %2	\n\t"
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
 
        "addi		%3, %3, 128	\n\t"
        "addi		%2, %2, 128	\n\t"
@@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
 
      "2:				\n\t"
 
-       "stxvw4x		32, 0, %3	\n\t"
-       "stxvw4x		33, %5, %3	\n\t"
-       "stxvw4x		34, %6, %3	\n\t"
-       "stxvw4x		35, %7, %3	\n\t"
-       "stxvw4x		36, %8, %3	\n\t"
-       "stxvw4x		37, %9, %3	\n\t"
-       "stxvw4x		38, %10, %3	\n\t"
-       "stxvw4x		39, %11, %3	\n\t"
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
 
        "addi		%3, %3, 128	\n\t"
 
-       "stxvw4x		40, 0, %3	\n\t"
-       "stxvw4x		41, %5, %3	\n\t"
-       "stxvw4x		42, %6, %3	\n\t"
-       "stxvw4x		43, %7, %3	\n\t"
-       "stxvw4x		44, %8, %3	\n\t"
-       "stxvw4x		45, %9, %3	\n\t"
-       "stxvw4x		46, %10, %3	\n\t"
-       "stxvw4x		47, %11, %3	\n"
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n"
 
      "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :

From be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 14:58:00 +1000
Subject: [PATCH 2/9] Optimise scopy for POWER9

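Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.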
---
 kernel/power/scopy_microk_power8.c | 64 +++++++++++++++---------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 444a6d4d5..7a54d5e1e 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
 {
   __asm__
     (
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %5, %2	\n\t"
-       "lxvw4x		42, %6, %2	\n\t"
-       "lxvw4x		43, %7, %2	\n\t"
-       "lxvw4x		44, %8, %2	\n\t"
-       "lxvw4x		45, %9, %2	\n\t"
-       "lxvw4x		46, %10, %2	\n\t"
-       "lxvw4x		47, %11, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
@@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
        ".p2align	5		\n"
      "1:				\n\t"
 
-       "stxvw4x		40, 0, %3	\n\t"
-       "stxvw4x		41, %5, %3	\n\t"
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %5, %2	\n\t"
-       "stxvw4x		42, %6, %3	\n\t"
-       "stxvw4x		43, %7, %3	\n\t"
-       "lxvw4x		42, %6, %2	\n\t"
-       "lxvw4x		43, %7, %2	\n\t"
-       "stxvw4x		44, %8, %3	\n\t"
-       "stxvw4x		45, %9, %3	\n\t"
-       "lxvw4x		44, %8, %2	\n\t"
-       "lxvw4x		45, %9, %2	\n\t"
-       "stxvw4x		46, %10, %3	\n\t"
-       "stxvw4x		47, %11, %3	\n\t"
-       "lxvw4x		46, %10, %2	\n\t"
-       "lxvw4x		47, %11, %2	\n\t"
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
 
        "addi		%3, %3, 128	\n\t"
        "addi		%2, %2, 128	\n\t"
@@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
 
      "2:				\n\t"
 
-       "stxvw4x		40, 0, %3	\n\t"
-       "stxvw4x		41, %5, %3	\n\t"
-       "stxvw4x		42, %6, %3	\n\t"
-       "stxvw4x		43, %7, %3	\n\t"
-       "stxvw4x		44, %8, %3	\n\t"
-       "stxvw4x		45, %9, %3	\n\t"
-       "stxvw4x		46, %10, %3	\n\t"
-       "stxvw4x		47, %11, %3	\n"
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n"
 
      "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :

From 6f4eca5ea4ab00726199277bb7a079900d20d388 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:23:20 +1000
Subject: [PATCH 3/9] Optimise sswap for POWER9

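Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.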
---
 kernel/power/sswap_microk_power8.c | 64 +++++++++++++++---------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index d44f16765..cfefdd6ef 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
        ".p2align	5		\n"
      "1:				\n\t"
 
-       "lxvw4x		32, 0, %4	\n\t"
-       "lxvw4x		33, %5, %4	\n\t"
-       "lxvw4x		34, %6, %4	\n\t"
-       "lxvw4x		35, %7, %4	\n\t"
-       "lxvw4x		36, %8, %4	\n\t"
-       "lxvw4x		37, %9, %4	\n\t"
-       "lxvw4x		38, %10, %4	\n\t"
-       "lxvw4x		39, %11, %4	\n\t"
+       "lxvd2x		32, 0, %4	\n\t"
+       "lxvd2x		33, %5, %4	\n\t"
+       "lxvd2x		34, %6, %4	\n\t"
+       "lxvd2x		35, %7, %4	\n\t"
+       "lxvd2x		36, %8, %4	\n\t"
+       "lxvd2x		37, %9, %4	\n\t"
+       "lxvd2x		38, %10, %4	\n\t"
+       "lxvd2x		39, %11, %4	\n\t"
 
-       "lxvw4x		40, 0, %3	\n\t"
-       "lxvw4x		41, %5, %3	\n\t"
-       "lxvw4x		42, %6, %3	\n\t"
-       "lxvw4x		43, %7, %3	\n\t"
-       "lxvw4x		44, %8, %3	\n\t"
-       "lxvw4x		45, %9, %3	\n\t"
-       "lxvw4x		46, %10, %3	\n\t"
-       "lxvw4x		47, %11, %3	\n\t"
+       "lxvd2x		40, 0, %3	\n\t"
+       "lxvd2x		41, %5, %3	\n\t"
+       "lxvd2x		42, %6, %3	\n\t"
+       "lxvd2x		43, %7, %3	\n\t"
+       "lxvd2x		44, %8, %3	\n\t"
+       "lxvd2x		45, %9, %3	\n\t"
+       "lxvd2x		46, %10, %3	\n\t"
+       "lxvd2x		47, %11, %3	\n\t"
 
-       "stxvw4x		32, 0, %3	\n\t"
-       "stxvw4x		33, %5, %3	\n\t"
-       "stxvw4x		34, %6, %3	\n\t"
-       "stxvw4x		35, %7, %3	\n\t"
-       "stxvw4x		36, %8, %3	\n\t"
-       "stxvw4x		37, %9, %3	\n\t"
-       "stxvw4x		38, %10, %3	\n\t"
-       "stxvw4x		39, %11, %3	\n\t"
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
 
        "addi		%3, %3, 128	\n\t"
 
-       "stxvw4x		40, 0, %4	\n\t"
-       "stxvw4x		41, %5, %4	\n\t"
-       "stxvw4x		42, %6, %4	\n\t"
-       "stxvw4x		43, %7, %4	\n\t"
-       "stxvw4x		44, %8, %4	\n\t"
-       "stxvw4x		45, %9, %4	\n\t"
-       "stxvw4x		46, %10, %4	\n\t"
-       "stxvw4x		47, %11, %4	\n\t"
+       "stxvd2x		40, 0, %4	\n\t"
+       "stxvd2x		41, %5, %4	\n\t"
+       "stxvd2x		42, %6, %4	\n\t"
+       "stxvd2x		43, %7, %4	\n\t"
+       "stxvd2x		44, %8, %4	\n\t"
+       "stxvd2x		45, %9, %4	\n\t"
+       "stxvd2x		46, %10, %4	\n\t"
+       "stxvd2x		47, %11, %4	\n\t"
 
        "addi		%4, %4, 128	\n\t"
 

From 4f09030fdc36444709cf3af9041a8043f1f6d83d Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:36:10 +1000
Subject: [PATCH 4/9] Optimise cswap for POWER9

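Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.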
---
 kernel/power/cswap_microk_power8.c | 128 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 1dd03dc88..8d7d0c0b9 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
        ".p2align	5		\n"
      "1:				\n\t"
 
-       "lxvw4x		32, 0, %4	\n\t"
-       "lxvw4x		33, %5, %4	\n\t"
-       "lxvw4x		34, %6, %4	\n\t"
-       "lxvw4x		35, %7, %4	\n\t"
-       "lxvw4x		36, %8, %4	\n\t"
-       "lxvw4x		37, %9, %4	\n\t"
-       "lxvw4x		38, %10, %4	\n\t"
-       "lxvw4x		39, %11, %4	\n\t"
+       "lxvd2x		32, 0, %4	\n\t"
+       "lxvd2x		33, %5, %4	\n\t"
+       "lxvd2x		34, %6, %4	\n\t"
+       "lxvd2x		35, %7, %4	\n\t"
+       "lxvd2x		36, %8, %4	\n\t"
+       "lxvd2x		37, %9, %4	\n\t"
+       "lxvd2x		38, %10, %4	\n\t"
+       "lxvd2x		39, %11, %4	\n\t"
 
        "addi		%4, %4, 128	\n\t"
 
-       "lxvw4x		40, 0, %4	\n\t"
-       "lxvw4x		41, %5, %4	\n\t"
-       "lxvw4x		42, %6, %4	\n\t"
-       "lxvw4x		43, %7, %4	\n\t"
-       "lxvw4x		44, %8, %4	\n\t"
-       "lxvw4x		45, %9, %4	\n\t"
-       "lxvw4x		46, %10, %4	\n\t"
-       "lxvw4x		47, %11, %4	\n\t"
+       "lxvd2x		40, 0, %4	\n\t"
+       "lxvd2x		41, %5, %4	\n\t"
+       "lxvd2x		42, %6, %4	\n\t"
+       "lxvd2x		43, %7, %4	\n\t"
+       "lxvd2x		44, %8, %4	\n\t"
+       "lxvd2x		45, %9, %4	\n\t"
+       "lxvd2x		46, %10, %4	\n\t"
+       "lxvd2x		47, %11, %4	\n\t"
 
        "addi		%4, %4, -128	\n\t"
 
-       "lxvw4x		48, 0, %3	\n\t"
-       "lxvw4x		49, %5, %3	\n\t"
-       "lxvw4x		50, %6, %3	\n\t"
-       "lxvw4x		51, %7, %3	\n\t"
-       "lxvw4x		0, %8, %3	\n\t"
-       "lxvw4x		1, %9, %3	\n\t"
-       "lxvw4x		2, %10, %3	\n\t"
-       "lxvw4x		3, %11, %3	\n\t"
+       "lxvd2x		48, 0, %3	\n\t"
+       "lxvd2x		49, %5, %3	\n\t"
+       "lxvd2x		50, %6, %3	\n\t"
+       "lxvd2x		51, %7, %3	\n\t"
+       "lxvd2x		0, %8, %3	\n\t"
+       "lxvd2x		1, %9, %3	\n\t"
+       "lxvd2x		2, %10, %3	\n\t"
+       "lxvd2x		3, %11, %3	\n\t"
 
        "addi		%3, %3, 128	\n\t"
 
-       "lxvw4x		4, 0, %3	\n\t"
-       "lxvw4x		5, %5, %3	\n\t"
-       "lxvw4x		6, %6, %3	\n\t"
-       "lxvw4x		7, %7, %3	\n\t"
-       "lxvw4x		8, %8, %3	\n\t"
-       "lxvw4x		9, %9, %3	\n\t"
-       "lxvw4x		10, %10, %3	\n\t"
-       "lxvw4x		11, %11, %3	\n\t"
+       "lxvd2x		4, 0, %3	\n\t"
+       "lxvd2x		5, %5, %3	\n\t"
+       "lxvd2x		6, %6, %3	\n\t"
+       "lxvd2x		7, %7, %3	\n\t"
+       "lxvd2x		8, %8, %3	\n\t"
+       "lxvd2x		9, %9, %3	\n\t"
+       "lxvd2x		10, %10, %3	\n\t"
+       "lxvd2x		11, %11, %3	\n\t"
 
        "addi		%3, %3, -128	\n\t"
 
-       "stxvw4x		32, 0, %3	\n\t"
-       "stxvw4x		33, %5, %3	\n\t"
-       "stxvw4x		34, %6, %3	\n\t"
-       "stxvw4x		35, %7, %3	\n\t"
-       "stxvw4x		36, %8, %3	\n\t"
-       "stxvw4x		37, %9, %3	\n\t"
-       "stxvw4x		38, %10, %3	\n\t"
-       "stxvw4x		39, %11, %3	\n\t"
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
 
        "addi		%3, %3, 128	\n\t"
 
-       "stxvw4x		40, 0, %3	\n\t"
-       "stxvw4x		41, %5, %3	\n\t"
-       "stxvw4x		42, %6, %3	\n\t"
-       "stxvw4x		43, %7, %3	\n\t"
-       "stxvw4x		44, %8, %3	\n\t"
-       "stxvw4x		45, %9, %3	\n\t"
-       "stxvw4x		46, %10, %3	\n\t"
-       "stxvw4x		47, %11, %3	\n\t"
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n\t"
 
        "addi		%3, %3, 128	\n\t"
 
-       "stxvw4x		48, 0, %4	\n\t"
-       "stxvw4x		49, %5, %4	\n\t"
-       "stxvw4x		50, %6, %4	\n\t"
-       "stxvw4x		51, %7, %4	\n\t"
-       "stxvw4x		0, %8, %4	\n\t"
-       "stxvw4x		1, %9, %4	\n\t"
-       "stxvw4x		2, %10, %4	\n\t"
-       "stxvw4x		3, %11, %4	\n\t"
+       "stxvd2x		48, 0, %4	\n\t"
+       "stxvd2x		49, %5, %4	\n\t"
+       "stxvd2x		50, %6, %4	\n\t"
+       "stxvd2x		51, %7, %4	\n\t"
+       "stxvd2x		0, %8, %4	\n\t"
+       "stxvd2x		1, %9, %4	\n\t"
+       "stxvd2x		2, %10, %4	\n\t"
+       "stxvd2x		3, %11, %4	\n\t"
 
        "addi		%4, %4, 128	\n\t"
 
-       "stxvw4x		4, 0, %4	\n\t"
-       "stxvw4x		5, %5, %4	\n\t"
-       "stxvw4x		6, %6, %4	\n\t"
-       "stxvw4x		7, %7, %4	\n\t"
-       "stxvw4x		8, %8, %4	\n\t"
-       "stxvw4x		9, %9, %4	\n\t"
-       "stxvw4x		10, %10, %4	\n\t"
-       "stxvw4x		11, %11, %4	\n\t"
+       "stxvd2x		4, 0, %4	\n\t"
+       "stxvd2x		5, %5, %4	\n\t"
+       "stxvd2x		6, %6, %4	\n\t"
+       "stxvd2x		7, %7, %4	\n\t"
+       "stxvd2x		8, %8, %4	\n\t"
+       "stxvd2x		9, %9, %4	\n\t"
+       "stxvd2x		10, %10, %4	\n\t"
+       "stxvd2x		11, %11, %4	\n\t"
 
        "addi		%4, %4, 128	\n\t"
 

From 19bdf9d52b222a4edd3e1710023af8c40f84c255 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:38:32 +1000
Subject: [PATCH 5/9] Optimise casum for POWER9

Use the lxvd2x instruction instead of lxvw4x for the vector loads.
lxvd2x performs far better on POWER9 than lxvw4x.
---
 kernel/power/casum_microk_power8.c | 32 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index 93ba50660..7d12c9885 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
        "xxlxor		38, 38,	38	\n\t"
        "xxlxor		39, 39,	39	\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %8, %2	\n\t"
-       "lxvw4x		42, %9, %2	\n\t"
-       "lxvw4x		43, %10, %2	\n\t"
-       "lxvw4x		44, %11, %2	\n\t"
-       "lxvw4x		45, %12, %2	\n\t"
-       "lxvw4x		46, %13, %2	\n\t"
-       "lxvw4x		47, %14, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
@@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
        "xvabssp		50, 42		\n\t"
        "xvabssp		51, 43		\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %8, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
 
        "xvabssp		%x3, 44		\n\t"
        "xvabssp		%x4, 45		\n\t"
 
-       "lxvw4x		42, %9, %2	\n\t"
-       "lxvw4x		43, %10, %2	\n\t"
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
 
        "xvabssp		%x5, 46		\n\t"
        "xvabssp		%x6, 47		\n\t"
 
-       "lxvw4x		44, %11, %2	\n\t"
-       "lxvw4x		45, %12, %2	\n\t"
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
 
        "xvaddsp		32, 32, 48	\n\t"
        "xvaddsp		33, 33, 49	\n\t"
 
-       "lxvw4x		46, %13, %2	\n\t"
-       "lxvw4x		47, %14, %2	\n\t"
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
 
        "xvaddsp		34, 34, 50	\n\t"
        "xvaddsp		35, 35, 51	\n\t"

From 32c7fe6bff6f04d61e6a09d10199a14e63e77083 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:39:27 +1000
Subject: [PATCH 6/9] Optimise sasum for POWER9

Use the lxvd2x instruction instead of lxvw4x for the vector loads.
lxvd2x performs far better on POWER9 than lxvw4x.
---
 kernel/power/sasum_microk_power8.c | 32 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 08a766f80..4bb515de8 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
        "xxlxor		38, 38,	38	\n\t"
        "xxlxor		39, 39,	39	\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %8, %2	\n\t"
-       "lxvw4x		42, %9, %2	\n\t"
-       "lxvw4x		43, %10, %2	\n\t"
-       "lxvw4x		44, %11, %2	\n\t"
-       "lxvw4x		45, %12, %2	\n\t"
-       "lxvw4x		46, %13, %2	\n\t"
-       "lxvw4x		47, %14, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
@@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
        "xvabssp		50, 42		\n\t"
        "xvabssp		51, 43		\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		41, %8, %2	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
 
        "xvabssp		%x3, 44		\n\t"
        "xvabssp		%x4, 45		\n\t"
 
-       "lxvw4x		42, %9, %2	\n\t"
-       "lxvw4x		43, %10, %2	\n\t"
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
 
        "xvabssp		%x5, 46		\n\t"
        "xvabssp		%x6, 47		\n\t"
 
-       "lxvw4x		44, %11, %2	\n\t"
-       "lxvw4x		45, %12, %2	\n\t"
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
 
        "xvaddsp		32, 32, 48	\n\t"
        "xvaddsp		33, 33, 49	\n\t"
 
-       "lxvw4x		46, %13, %2	\n\t"
-       "lxvw4x		47, %14, %2	\n\t"
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
 
        "xvaddsp		34, 34, 50	\n\t"
        "xvaddsp		35, 35, 51	\n\t"

From e0034de22d9a789988e29e3b67a796cee0c97965 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:43:31 +1000
Subject: [PATCH 7/9] Optimise sdot for POWER9

Use the lxvd2x instruction instead of lxvw4x for the vector loads.
lxvd2x performs far better on POWER9 than lxvw4x.
---
 kernel/power/sdot_microk_power8.c | 64 +++++++++++++++----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index 7f7ccfac3..bfe100c8b 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
        "xxlxor		38, 38,	38	\n\t"
        "xxlxor		39, 39,	39	\n\t"
 
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		48, 0, %3	\n\t"
-       "lxvw4x		41, %10, %2	\n\t"
-       "lxvw4x		49, %10, %3	\n\t"
-       "lxvw4x		42, %11, %2	\n\t"
-       "lxvw4x		50, %11, %3	\n\t"
-       "lxvw4x		43, %12, %2	\n\t"
-       "lxvw4x		51, %12, %3	\n\t"
-       "lxvw4x		44, %13, %2	\n\t"
-       "lxvw4x		%x4, %13, %3	\n\t"
-       "lxvw4x		45, %14, %2	\n\t"
-       "lxvw4x		%x5, %14, %3	\n\t"
-       "lxvw4x		46, %15, %2	\n\t"
-       "lxvw4x		%x6, %15, %3	\n\t"
-       "lxvw4x		47, %16, %2	\n\t"
-       "lxvw4x		%x7, %16, %3	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		48, 0, %3	\n\t"
+       "lxvd2x		41, %10, %2	\n\t"
+       "lxvd2x		49, %10, %3	\n\t"
+       "lxvd2x		42, %11, %2	\n\t"
+       "lxvd2x		50, %11, %3	\n\t"
+       "lxvd2x		43, %12, %2	\n\t"
+       "lxvd2x		51, %12, %3	\n\t"
+       "lxvd2x		44, %13, %2	\n\t"
+       "lxvd2x		%x4, %13, %3	\n\t"
+       "lxvd2x		45, %14, %2	\n\t"
+       "lxvd2x		%x5, %14, %3	\n\t"
+       "lxvd2x		46, %15, %2	\n\t"
+       "lxvd2x		%x6, %15, %3	\n\t"
+       "lxvd2x		47, %16, %2	\n\t"
+       "lxvd2x		%x7, %16, %3	\n\t"
 
        "addi		%2, %2, 128	\n\t"
        "addi		%3, %3, 128	\n\t"
@@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
      "1:				\n\t"
 
        "xvmaddasp	32, 40, 48	\n\t"
-       "lxvw4x		40, 0, %2	\n\t"
-       "lxvw4x		48, 0, %3	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		48, 0, %3	\n\t"
        "xvmaddasp	33, 41, 49	\n\t"
-       "lxvw4x		41, %10, %2	\n\t"
-       "lxvw4x		49, %10, %3	\n\t"
+       "lxvd2x		41, %10, %2	\n\t"
+       "lxvd2x		49, %10, %3	\n\t"
        "xvmaddasp	34, 42, 50	\n\t"
-       "lxvw4x		42, %11, %2	\n\t"
-       "lxvw4x		50, %11, %3	\n\t"
+       "lxvd2x		42, %11, %2	\n\t"
+       "lxvd2x		50, %11, %3	\n\t"
        "xvmaddasp	35, 43, 51	\n\t"
-       "lxvw4x		43, %12, %2	\n\t"
-       "lxvw4x		51, %12, %3	\n\t"
+       "lxvd2x		43, %12, %2	\n\t"
+       "lxvd2x		51, %12, %3	\n\t"
        "xvmaddasp	36, 44, %x4	\n\t"
-       "lxvw4x		44, %13, %2	\n\t"
-       "lxvw4x		%x4, %13, %3	\n\t"
+       "lxvd2x		44, %13, %2	\n\t"
+       "lxvd2x		%x4, %13, %3	\n\t"
        "xvmaddasp	37, 45, %x5	\n\t"
-       "lxvw4x		45, %14, %2	\n\t"
-       "lxvw4x		%x5, %14, %3	\n\t"
+       "lxvd2x		45, %14, %2	\n\t"
+       "lxvd2x		%x5, %14, %3	\n\t"
        "xvmaddasp	38, 46, %x6	\n\t"
-       "lxvw4x		46, %15, %2	\n\t"
-       "lxvw4x		%x6, %15, %3	\n\t"
+       "lxvd2x		46, %15, %2	\n\t"
+       "lxvd2x		%x6, %15, %3	\n\t"
        "xvmaddasp	39, 47, %x7	\n\t"
-       "lxvw4x		47, %16, %2	\n\t"
-       "lxvw4x		%x7, %16, %3	\n\t"
+       "lxvd2x		47, %16, %2	\n\t"
+       "lxvd2x		%x7, %16, %3	\n\t"
 
        "addi		%2, %2, 128	\n\t"
        "addi		%3, %3, 128	\n\t"

From edc97918f8e45e6e922d0e221cf103a4c736ca61 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:45:58 +1000
Subject: [PATCH 8/9] Optimise srot for POWER9

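Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.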
---
 kernel/power/srot_microk_power8.c | 64 +++++++++++++++----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index 0a18c16e0..6eecb60a1 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "xscvdpspn	37, %x14	\n\t"	// load s to all words
        "xxspltw		37, 37, 0	\n\t"
 
-       "lxvw4x		32, 0, %3	\n\t"	// load x
-       "lxvw4x		33, %15, %3	\n\t"
-       "lxvw4x		34, %16, %3	\n\t"
-       "lxvw4x		35, %17, %3	\n\t"
+       "lxvd2x		32, 0, %3	\n\t"	// load x
+       "lxvd2x		33, %15, %3	\n\t"
+       "lxvd2x		34, %16, %3	\n\t"
+       "lxvd2x		35, %17, %3	\n\t"
 
-       "lxvw4x		48, 0, %4	\n\t"	// load y
-       "lxvw4x		49, %15, %4	\n\t"
-       "lxvw4x		50, %16, %4	\n\t"
-       "lxvw4x		51, %17, %4	\n\t"
+       "lxvd2x		48, 0, %4	\n\t"	// load y
+       "lxvd2x		49, %15, %4	\n\t"
+       "lxvd2x		50, %16, %4	\n\t"
+       "lxvd2x		51, %17, %4	\n\t"
 
        "addi		%3, %3, 64	\n\t"
        "addi		%4, %4, 64	\n\t"
@@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "xvmulsp		44, 32, 37	\n\t"	// s * x
        "xvmulsp		45, 33, 37	\n\t"
 
-       "lxvw4x		32, 0, %3	\n\t"	// load x
-       "lxvw4x		33, %15, %3	\n\t"
+       "lxvd2x		32, 0, %3	\n\t"	// load x
+       "lxvd2x		33, %15, %3	\n\t"
 
        "xvmulsp		46, 34, 37	\n\t"
        "xvmulsp		47, 35, 37	\n\t"
 
-       "lxvw4x		34, %16, %3	\n\t"
-       "lxvw4x		35, %17, %3	\n\t"
+       "lxvd2x		34, %16, %3	\n\t"
+       "lxvd2x		35, %17, %3	\n\t"
 
        "xvmulsp		%x9, 48, 37	\n\t"	// s * y
        "xvmulsp		%x10, 49, 37	\n\t"
 
-       "lxvw4x		48, 0, %4	\n\t"	// load y
-       "lxvw4x		49, %15, %4	\n\t"
+       "lxvd2x		48, 0, %4	\n\t"	// load y
+       "lxvd2x		49, %15, %4	\n\t"
 
        "xvmulsp		%x11, 50, 37	\n\t"
        "xvmulsp		%x12, 51, 37	\n\t"
 
-       "lxvw4x		50, %16, %4	\n\t"
-       "lxvw4x		51, %17, %4	\n\t"
+       "lxvd2x		50, %16, %4	\n\t"
+       "lxvd2x		51, %17, %4	\n\t"
 
        "xvaddsp		40, 40, %x9	\n\t"	// c * x + s * y
        "xvaddsp		41, 41, %x10	\n\t"	// c * x + s * y
@@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "xvsubsp		%x7, %x7, 46	\n\t"	// c * y - s * x
        "xvsubsp		%x8, %x8, 47	\n\t"	// c * y - s * x
 
-       "stxvw4x		40, 0, %3	\n\t"	// store x
-       "stxvw4x		41, %15, %3	\n\t"
-       "stxvw4x		42, %16, %3	\n\t"
-       "stxvw4x		43, %17, %3	\n\t"
+       "stxvd2x		40, 0, %3	\n\t"	// store x
+       "stxvd2x		41, %15, %3	\n\t"
+       "stxvd2x		42, %16, %3	\n\t"
+       "stxvd2x		43, %17, %3	\n\t"
 
-       "stxvw4x		%x5, 0, %4	\n\t"	// store y
-       "stxvw4x		%x6, %15, %4	\n\t"
-       "stxvw4x		%x7, %16, %4	\n\t"
-       "stxvw4x		%x8, %17, %4	\n\t"
+       "stxvd2x		%x5, 0, %4	\n\t"	// store y
+       "stxvd2x		%x6, %15, %4	\n\t"
+       "stxvd2x		%x7, %16, %4	\n\t"
+       "stxvd2x		%x8, %17, %4	\n\t"
 
        "addi		%3, %3, 128	\n\t"
        "addi		%4, %4, 128	\n\t"
@@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "xvsubsp		%x7, %x7, 46	\n\t"	// c * y - s * x
        "xvsubsp		%x8, %x8, 47	\n\t"	// c * y - s * x
 
-       "stxvw4x		40, 0, %3	\n\t"	// store x
-       "stxvw4x		41, %15, %3	\n\t"
-       "stxvw4x		42, %16, %3	\n\t"
-       "stxvw4x		43, %17, %3	\n\t"
+       "stxvd2x		40, 0, %3	\n\t"	// store x
+       "stxvd2x		41, %15, %3	\n\t"
+       "stxvd2x		42, %16, %3	\n\t"
+       "stxvd2x		43, %17, %3	\n\t"
 
-       "stxvw4x		%x5, 0, %4	\n\t"	// store y
-       "stxvw4x		%x6, %15, %4	\n\t"
-       "stxvw4x		%x7, %16, %4	\n\t"
-       "stxvw4x		%x8, %17, %4	\n"
+       "stxvd2x		%x5, 0, %4	\n\t"	// store y
+       "stxvd2x		%x6, %15, %4	\n\t"
+       "stxvd2x		%x7, %16, %4	\n\t"
+       "stxvd2x		%x8, %17, %4	\n"
 
      "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
      "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"

From bd831a03a80d642693c786f7a65265ad40a50fc0 Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Wed, 14 Jun 2017 16:47:56 +1000
Subject: [PATCH 9/9] Optimise sscal for POWER9

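Use the lxvd2x/stxvd2x instructions instead of lxvw4x/stxvw4x.
lxvd2x performs far better on POWER9 than lxvw4x. The stores are switched
to the matching stxvd2x so that data is written back in the same element
layout in which it was loaded.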
---
 kernel/power/sscal_microk_power8.c | 80 +++++++++++++++---------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 49862a329..058ff3399 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "xscvdpspn	%x3, %x3	\n\t"
        "xxspltw		%x3, %x3, 0	\n\t"
 
-       "lxvw4x		32, 0, %2	\n\t"
-       "lxvw4x		33, %4, %2	\n\t"
-       "lxvw4x		34, %5, %2	\n\t"
-       "lxvw4x		35, %6, %2	\n\t"
-       "lxvw4x		36, %7, %2	\n\t"
-       "lxvw4x		37, %8, %2	\n\t"
-       "lxvw4x		38, %9, %2	\n\t"
-       "lxvw4x		39, %10, %2	\n\t"
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %4, %2	\n\t"
+       "lxvd2x		34, %5, %2	\n\t"
+       "lxvd2x		35, %6, %2	\n\t"
+       "lxvd2x		36, %7, %2	\n\t"
+       "lxvd2x		37, %8, %2	\n\t"
+       "lxvd2x		38, %9, %2	\n\t"
+       "lxvd2x		39, %10, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"
 
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
 
        "xvmulsp		40, 32, %x3	\n\t"
        "xvmulsp		41, 33, %x3	\n\t"
-       "lxvw4x		32, 0, %2	\n\t"
-       "lxvw4x		33, %4, %2	\n\t"
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %4, %2	\n\t"
        "xvmulsp		42, 34, %x3	\n\t"
        "xvmulsp		43, 35, %x3	\n\t"
-       "lxvw4x		34, %5, %2	\n\t"
-       "lxvw4x		35, %6, %2	\n\t"
+       "lxvd2x		34, %5, %2	\n\t"
+       "lxvd2x		35, %6, %2	\n\t"
        "xvmulsp		44, 36, %x3	\n\t"
        "xvmulsp		45, 37, %x3	\n\t"
-       "lxvw4x		36, %7, %2	\n\t"
-       "lxvw4x		37, %8, %2	\n\t"
+       "lxvd2x		36, %7, %2	\n\t"
+       "lxvd2x		37, %8, %2	\n\t"
        "xvmulsp		46, 38, %x3	\n\t"
        "xvmulsp		47, 39, %x3	\n\t"
-       "lxvw4x		38, %9, %2	\n\t"
-       "lxvw4x		39, %10, %2	\n\t"
+       "lxvd2x		38, %9, %2	\n\t"
+       "lxvd2x		39, %10, %2	\n\t"
 
        "addi		%2, %2, -128	\n\t"
 
-       "stxvw4x		40, 0, %2	\n\t"
-       "stxvw4x		41, %4, %2	\n\t"
-       "stxvw4x		42, %5, %2	\n\t"
-       "stxvw4x		43, %6, %2	\n\t"
-       "stxvw4x		44, %7, %2	\n\t"
-       "stxvw4x		45, %8, %2	\n\t"
-       "stxvw4x		46, %9, %2	\n\t"
-       "stxvw4x		47, %10, %2	\n\t"
+       "stxvd2x		40, 0, %2	\n\t"
+       "stxvd2x		41, %4, %2	\n\t"
+       "stxvd2x		42, %5, %2	\n\t"
+       "stxvd2x		43, %6, %2	\n\t"
+       "stxvd2x		44, %7, %2	\n\t"
+       "stxvd2x		45, %8, %2	\n\t"
+       "stxvd2x		46, %9, %2	\n\t"
+       "stxvd2x		47, %10, %2	\n\t"
 
        "addi		%2, %2, 256	\n\t"
 
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "xvmulsp		46, 38, %x3	\n\t"
        "xvmulsp		47, 39, %x3	\n\t"
 
-       "stxvw4x		40, 0, %2	\n\t"
-       "stxvw4x		41, %4, %2	\n\t"
-       "stxvw4x		42, %5, %2	\n\t"
-       "stxvw4x		43, %6, %2	\n\t"
-       "stxvw4x		44, %7, %2	\n\t"
-       "stxvw4x		45, %8, %2	\n\t"
-       "stxvw4x		46, %9, %2	\n\t"
-       "stxvw4x		47, %10, %2	\n"
+       "stxvd2x		40, 0, %2	\n\t"
+       "stxvd2x		41, %4, %2	\n\t"
+       "stxvd2x		42, %5, %2	\n\t"
+       "stxvd2x		43, %6, %2	\n\t"
+       "stxvd2x		44, %7, %2	\n\t"
+       "stxvd2x		45, %8, %2	\n\t"
+       "stxvd2x		46, %9, %2	\n\t"
+       "stxvd2x		47, %10, %2	\n"
 
      "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
      :
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
        ".p2align	5		\n"
      "1:				\n\t"
 
-       "stxvw4x		%x3, 0, %2	\n\t"
-       "stxvw4x		%x3, %4, %2	\n\t"
-       "stxvw4x		%x3, %5, %2	\n\t"
-       "stxvw4x		%x3, %6, %2	\n\t"
-       "stxvw4x		%x3, %7, %2	\n\t"
-       "stxvw4x		%x3, %8, %2	\n\t"
-       "stxvw4x		%x3, %9, %2	\n\t"
-       "stxvw4x		%x3, %10, %2	\n\t"
+       "stxvd2x		%x3, 0, %2	\n\t"
+       "stxvd2x		%x3, %4, %2	\n\t"
+       "stxvd2x		%x3, %5, %2	\n\t"
+       "stxvd2x		%x3, %6, %2	\n\t"
+       "stxvd2x		%x3, %7, %2	\n\t"
+       "stxvd2x		%x3, %8, %2	\n\t"
+       "stxvd2x		%x3, %9, %2	\n\t"
+       "stxvd2x		%x3, %10, %2	\n\t"
 
        "addi		%2, %2, 128	\n\t"