From bd831a03a80d642693c786f7a65265ad40a50fc0 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:47:56 +1000 Subject: [PATCH] Optimise sscal for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sscal_microk_power8.c | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, %10, %2 \n\t" "addi %2, %2, 128 \n\t"