From edc97918f8e45e6e922d0e221cf103a4c736ca61 Mon Sep 17 00:00:00 2001
From: Matt Brown
Date: Wed, 14 Jun 2017 16:45:58 +1000
Subject: [PATCH] Optimise srot for POWER9

Use the lxvd2x instruction instead of lxvw4x. lxvd2x performs far
better on the new POWER9 architecture than lxvw4x.
---
 kernel/power/srot_microk_power8.c | 64 +++++++++++++++----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index 0a18c16e0..6eecb60a1 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
   "xscvdpspn 37, %x14 \n\t" // load s to all words
   "xxspltw   37, 37, 0 \n\t"
 
-  "lxvw4x  32, 0, %3   \n\t" // load x
-  "lxvw4x  33, %15, %3 \n\t"
-  "lxvw4x  34, %16, %3 \n\t"
-  "lxvw4x  35, %17, %3 \n\t"
+  "lxvd2x  32, 0, %3   \n\t" // load x
+  "lxvd2x  33, %15, %3 \n\t"
+  "lxvd2x  34, %16, %3 \n\t"
+  "lxvd2x  35, %17, %3 \n\t"
 
-  "lxvw4x  48, 0, %4   \n\t" // load y
-  "lxvw4x  49, %15, %4 \n\t"
-  "lxvw4x  50, %16, %4 \n\t"
-  "lxvw4x  51, %17, %4 \n\t"
+  "lxvd2x  48, 0, %4   \n\t" // load y
+  "lxvd2x  49, %15, %4 \n\t"
+  "lxvd2x  50, %16, %4 \n\t"
+  "lxvd2x  51, %17, %4 \n\t"
 
   "addi    %3, %3, 64  \n\t"
   "addi    %4, %4, 64  \n\t"
@@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
   "xvmulsp 44, 32, 37  \n\t" // s * x
   "xvmulsp 45, 33, 37  \n\t"
 
-  "lxvw4x  32, 0, %3   \n\t" // load x
-  "lxvw4x  33, %15, %3 \n\t"
+  "lxvd2x  32, 0, %3   \n\t" // load x
+  "lxvd2x  33, %15, %3 \n\t"
 
   "xvmulsp 46, 34, 37  \n\t"
   "xvmulsp 47, 35, 37  \n\t"
 
-  "lxvw4x  34, %16, %3 \n\t"
-  "lxvw4x  35, %17, %3 \n\t"
+  "lxvd2x  34, %16, %3 \n\t"
+  "lxvd2x  35, %17, %3 \n\t"
 
   "xvmulsp %x9, 48, 37  \n\t" // s * y
   "xvmulsp %x10, 49, 37 \n\t"
 
-  "lxvw4x  48, 0, %4   \n\t" // load y
-  "lxvw4x  49, %15, %4 \n\t"
+  "lxvd2x  48, 0, %4   \n\t" // load y
+  "lxvd2x  49, %15, %4 \n\t"
 
   "xvmulsp %x11, 50, 37 \n\t"
   "xvmulsp %x12, 51, 37 \n\t"
 
-  "lxvw4x  50, %16, %4 \n\t"
-  "lxvw4x  51, %17, %4 \n\t"
+  "lxvd2x  50, %16, %4 \n\t"
+  "lxvd2x  51, %17, %4 \n\t"
 
   "xvaddsp 40, 40, %x9  \n\t" // c * x + s * y
   "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
   "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
   "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
 
-  "stxvw4x 40, 0, %3   \n\t" // store x
-  "stxvw4x 41, %15, %3 \n\t"
-  "stxvw4x 42, %16, %3 \n\t"
-  "stxvw4x 43, %17, %3 \n\t"
+  "stxvd2x 40, 0, %3   \n\t" // store x
+  "stxvd2x 41, %15, %3 \n\t"
+  "stxvd2x 42, %16, %3 \n\t"
+  "stxvd2x 43, %17, %3 \n\t"
 
-  "stxvw4x %x5, 0, %4   \n\t" // store y
-  "stxvw4x %x6, %15, %4 \n\t"
-  "stxvw4x %x7, %16, %4 \n\t"
-  "stxvw4x %x8, %17, %4 \n\t"
+  "stxvd2x %x5, 0, %4   \n\t" // store y
+  "stxvd2x %x6, %15, %4 \n\t"
+  "stxvd2x %x7, %16, %4 \n\t"
+  "stxvd2x %x8, %17, %4 \n\t"
 
   "addi    %3, %3, 128 \n\t"
   "addi    %4, %4, 128 \n\t"
@@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
   "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
   "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
 
-  "stxvw4x 40, 0, %3   \n\t" // store x
-  "stxvw4x 41, %15, %3 \n\t"
-  "stxvw4x 42, %16, %3 \n\t"
-  "stxvw4x 43, %17, %3 \n\t"
+  "stxvd2x 40, 0, %3   \n\t" // store x
+  "stxvd2x 41, %15, %3 \n\t"
+  "stxvd2x 42, %16, %3 \n\t"
+  "stxvd2x 43, %17, %3 \n\t"
 
-  "stxvw4x %x5, 0, %4   \n\t" // store y
-  "stxvw4x %x6, %15, %4 \n\t"
-  "stxvw4x %x7, %16, %4 \n\t"
-  "stxvw4x %x8, %17, %4 \n"
+  "stxvd2x %x5, 0, %4   \n\t" // store y
+  "stxvd2x %x6, %15, %4 \n\t"
+  "stxvd2x %x7, %16, %4 \n\t"
+  "stxvd2x %x8, %17, %4 \n"
 
   "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
   "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
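
Note on correctness (a reviewer's sketch, not part of the patch): lxvd2x and
stxvd2x move two 64-bit doublewords, so on little-endian POWER the four floats
may sit in a different lane order inside the vector register than lxvw4x would
give. That is harmless for this kernel because every lane is updated
independently with the same broadcast c and s, and the matching stxvd2x store
applies the inverse reordering on the way back to memory. A minimal scalar
reference of the same lane-wise update, under the hypothetical name srot_ref,
makes this explicit:

    /* Scalar reference for the vectorised rot kernel above.
       srot_ref is a hypothetical name used only for illustration;
       it is not a function in this patch or in OpenBLAS. */
    static void srot_ref (long n, float *x, float *y, float c, float s)
    {
        long i;
        for (i = 0; i < n; i++) {
            float xi = x[i];
            float yi = y[i];
            x[i] = c * xi + s * yi;   /* matches "c * x + s * y" above */
            y[i] = c * yi - s * xi;   /* matches "c * y - s * x" above */
        }
    }

Because x[i] and y[i] never interact with any other index, the result is the
same no matter how the elements are permuted within a vector register, as long
as loads and stores use the same element ordering.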