From 96dd0ef4f71da18324864e979133e873aa66306a Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:25:10 +1000 Subject: [PATCH 01/35] Optimise ccopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/ccopy_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index b2b1bead1..613c4d286 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 
\n\t" "addi %2, %2, 128 \n\t" @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:58:00 +1000 Subject: [PATCH 02/35] Optimise scopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/scopy_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d5..7a54d5e1e 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, 
%9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From 6f4eca5ea4ab00726199277bb7a079900d20d388 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:23:20 +1000 Subject: [PATCH 03/35] Optimise sswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sswap_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 4f09030fdc36444709cf3af9041a8043f1f6d83d Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:36:10 +1000 Subject: [PATCH 04/35] Optimise cswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
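As in the ccopy/scopy/sswap patches above, the substitution here only changes how the 16-byte VSX registers are viewed during the move: lxvw4x treats each register as four 32-bit words and lxvd2x as two 64-bit doublewords, but no arithmetic is done, so the same bytes reach the destination either way. A rough scalar model of why a pure copy/swap cannot care about the lane view (illustration only, not code from the kernels; ccopy_chunks is a hypothetical helper):

#include <stdint.h>
#include <string.h>

/* Copy 2*n floats (n complex singles) in 16-byte chunks.  Whether a chunk is
 * viewed as four words (lxvw4x) or two doublewords (lxvd2x) is irrelevant for
 * a pure copy: the same 16 bytes are written back out. */
static void ccopy_chunks(long n, const float *x, float *y)
{
    long bytes = 2 * n * (long) sizeof(float);
    for (long i = 0; i + 16 <= bytes; i += 16) {
        uint64_t d[2];                         /* doubleword view, as lxvd2x */
        memcpy(d, (const char *)x + i, 16);    /* load  */
        memcpy((char *)y + i, d, 16);          /* store */
    }
}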
--- kernel/power/cswap_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 1dd03dc88..8d7d0c0b9 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "lxvw4x 40, 0, %4 \n\t" - "lxvw4x 41, %5, %4 \n\t" - "lxvw4x 42, %6, %4 \n\t" - "lxvw4x 43, %7, %4 \n\t" - "lxvw4x 44, %8, %4 \n\t" - "lxvw4x 45, %9, %4 \n\t" - "lxvw4x 46, %10, %4 \n\t" - "lxvw4x 47, %11, %4 \n\t" + "lxvd2x 40, 0, %4 \n\t" + "lxvd2x 41, %5, %4 \n\t" + "lxvd2x 42, %6, %4 \n\t" + "lxvd2x 43, %7, %4 \n\t" + "lxvd2x 44, %8, %4 \n\t" + "lxvd2x 45, %9, %4 \n\t" + "lxvd2x 46, %10, %4 \n\t" + "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 49, %5, %3 \n\t" - "lxvw4x 50, %6, %3 \n\t" - "lxvw4x 51, %7, %3 \n\t" - "lxvw4x 0, %8, %3 \n\t" - "lxvw4x 1, %9, %3 \n\t" - "lxvw4x 2, %10, %3 \n\t" - "lxvw4x 3, %11, %3 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 0, %8, %3 \n\t" + "lxvd2x 1, %9, %3 \n\t" + "lxvd2x 2, %10, %3 \n\t" + "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "lxvw4x 4, 0, %3 \n\t" - "lxvw4x 5, %5, %3 \n\t" - "lxvw4x 6, %6, %3 \n\t" - "lxvw4x 7, %7, %3 \n\t" - "lxvw4x 8, %8, %3 \n\t" - "lxvw4x 9, %9, %3 \n\t" - "lxvw4x 10, %10, %3 \n\t" - "lxvw4x 11, %11, %3 \n\t" + "lxvd2x 4, 0, %3 \n\t" + "lxvd2x 5, %5, %3 \n\t" + "lxvd2x 6, %6, %3 \n\t" + "lxvd2x 7, %7, %3 \n\t" + "lxvd2x 8, %8, %3 \n\t" + "lxvd2x 9, %9, %3 \n\t" + "lxvd2x 10, %10, %3 \n\t" + "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 48, 0, %4 \n\t" - "stxvw4x 49, %5, %4 \n\t" - "stxvw4x 50, %6, %4 \n\t" - "stxvw4x 51, %7, %4 \n\t" - "stxvw4x 0, %8, %4 \n\t" - "stxvw4x 1, %9, %4 \n\t" - "stxvw4x 2, %10, %4 \n\t" - "stxvw4x 3, %11, %4 \n\t" + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 0, %8, %4 \n\t" + 
"stxvd2x 1, %9, %4 \n\t" + "stxvd2x 2, %10, %4 \n\t" + "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "stxvw4x 4, 0, %4 \n\t" - "stxvw4x 5, %5, %4 \n\t" - "stxvw4x 6, %6, %4 \n\t" - "stxvw4x 7, %7, %4 \n\t" - "stxvw4x 8, %8, %4 \n\t" - "stxvw4x 9, %9, %4 \n\t" - "stxvw4x 10, %10, %4 \n\t" - "stxvw4x 11, %11, %4 \n\t" + "stxvd2x 4, 0, %4 \n\t" + "stxvd2x 5, %5, %4 \n\t" + "stxvd2x 6, %6, %4 \n\t" + "stxvd2x 7, %7, %4 \n\t" + "stxvd2x 8, %8, %4 \n\t" + "stxvd2x 9, %9, %4 \n\t" + "stxvd2x 10, %10, %4 \n\t" + "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 19bdf9d52b222a4edd3e1710023af8c40f84c255 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:38:32 +1000 Subject: [PATCH 05/35] Optimise casum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/casum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 93ba50660..7d12c9885 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From 32c7fe6bff6f04d61e6a09d10199a14e63e77083 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:39:27 +1000 Subject: [PATCH 06/35] Optimise sasum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
--- kernel/power/sasum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 08a766f80..4bb515de8 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From e0034de22d9a789988e29e3b67a796cee0c97965 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:43:31 +1000 Subject: [PATCH 07/35] Optimise sdot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
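The same reasoning carries over from the copy/swap patches to these reduction kernels: xvabssp and xvmaddasp operate lane by lane, both dot-product operands are loaded through the same doubleword-ordered layout so corresponding elements still meet in the same lane, and the per-lane accumulators in vs32-vs39 are collapsed into a single scalar after the loop (outside the hunks shown here). The word order inside each register therefore never reaches the result, for sdot just as for the casum/sasum patches above. A scalar sketch of that invariance (illustrative only; dot4 is a made-up helper, not part of the kernel):

/* Four running partial sums, one per lane, reduced once at the end.  Any
 * fixed permutation applied identically to x and y inside each 4-element
 * chunk leaves every product x[j]*y[j] intact and the final sum unchanged. */
static float dot4(const float *x, const float *y, long n)
{
    float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (long i = 0; i + 4 <= n; i += 4)
        for (int k = 0; k < 4; k++)
            acc[k] += x[i + k] * y[i + k];     /* xvmaddasp, lane k */
    return acc[0] + acc[1] + acc[2] + acc[3];  /* final cross-lane reduction */
}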
--- kernel/power/sdot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index 7f7ccfac3..bfe100c8b 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" From edc97918f8e45e6e922d0e221cf103a4c736ca61 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:45:58 +1000 Subject: [PATCH 08/35] Optimise srot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
--- kernel/power/srot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 0a18c16e0..6eecb60a1 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n\t" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" From bd831a03a80d642693c786f7a65265ad40a50fc0 Mon Sep 17 00:00:00 2001 From: Matt 
Brown Date: Wed, 14 Jun 2017 16:47:56 +1000 Subject: [PATCH 09/35] Optimise sscal for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sscal_microk_power8.c | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, 
%10, %2 \n\t" "addi %2, %2, 128 \n\t" From 34513be72654102504f231cc27b33d26eddf88ac Mon Sep 17 00:00:00 2001 From: Neil Shipp Date: Fri, 23 Jun 2017 13:07:34 -0700 Subject: [PATCH 10/35] Add Microsoft Windows 10 UWP build support --- CMakeLists.txt | 9 ++++- cmake/c_check.cmake | 5 +++ cmake/prebuild.cmake | 62 ++++++++++++++++++------------- common.h | 13 ++++++- driver/others/CMakeLists.txt | 2 + driver/others/blas_server_win32.c | 7 +++- utest/CMakeLists.txt | 4 ++ 7 files changed, 70 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c20a57eac..e6ae891b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h ) - ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) + + ADD_CUSTOM_TARGET(genconfig + ALL + DEPENDS openblas_config.h + ) add_dependencies(genconfig ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(genf77blas + ALL COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h ) @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(gencblas + ALL COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h ) - add_dependencies(gencblas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 115bdaf4e..56ae612ea 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") +if (${HOST_OS} STREQUAL "WINDOWSSTORE") + file(APPEND ${TARGET_CONF} + "#define OS_WINNT\t1\n") +endif () + diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6a21c0bcc..a7f98bfb8 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -72,20 +72,26 @@ if (MSVC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) endif() +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + # disable WindowsStore strict CRT checks + set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) +endif () + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) -try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE 
${PROJECT_BINARY_DIR}/${GETARCH_BIN} + ) -if (NOT ${GETARCH_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + if (NOT ${GETARCH_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + endif () endif () - message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) -try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH2_RESULT ${GETARCH2_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} + ) -if (NOT ${GETARCH2_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + if (NOT ${GETARCH2_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + endif () endif () # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) -try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GEN_CONFIG_H_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GEN_CONFIG_H_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} + ) -if (NOT ${GEN_CONFIG_H_RESULT}) - MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") -endif () + if (NOT ${GEN_CONFIG_H_RESULT}) + MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") + endif () +endif () \ No newline at end of file diff --git a/common.h b/common.h index c9cc2f0f2..4463141c8 100644 --- a/common.h +++ b/common.h @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef ASSEMBLER +#ifdef OS_WINDOWSSTORE +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) 0 +#else #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif +#endif #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ *address = 0; } - +#ifdef OS_WINDOWSSTORE +static __inline int readenv_atoi(char *env) { + return 0; +} +#else #ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { 
return(0); } #endif - +#endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 489d40c76..8e0be1e0e 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -12,6 +12,8 @@ if (SMP) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") + set(BLAS_SERVER blas_server_win32.c) endif () if (NOT DEFINED BLAS_SERVER) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 081bdd7d4..cde8ca793 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], 5); //INFINITE); - TerminateThread(blas_threads[i],0); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); +#ifndef OS_WINDOWSSTORE +// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + TerminateThread(blas_threads[i],0); +#endif } blas_server_avail = 0; diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 9cf518e05..bd31ed9c6 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") +set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") +endif() + #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) From 0d5c8e53861adbb54ee9952ed76c2e8dd029ca96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:43:13 +0530 Subject: [PATCH 11/35] arm: Determine the abi from compiler if not specified on command line If ARM abi is not explicitly mentioned on the command line, then set the arm abi to softfp or hard according to the compiler environment. This assumes that compiler sets the defines __ARM_PCS and __ARM_PCS_VFP accordingly. 
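The detection added to c_check just preprocesses a probe file with the target compiler (-dM -E) and greps for __ARM_PCS_VFP. The same information is visible from C; a minimal probe, assuming a GCC/Clang-style ARM toolchain that provides the ACLE calling-convention macros:

#include <stdio.h>

int main(void)
{
#if defined(__ARM_PCS_VFP)
    puts("hard");    /* FP arguments/results passed in VFP registers */
#elif defined(__ARM_PCS)
    puts("softfp");  /* AAPCS base: FP arguments/results in core registers */
#else
    puts("unknown"); /* macros not provided; fall back to an explicit flag */
#endif
    return 0;
}

Building this probe with the same CC and CFLAGS used for the library reports the ABI the compiler is actually targeting, which is exactly what the new ARM_ABI_AUTO Makefile variable captures.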
--- Makefile.arm | 23 +++-------------------- Makefile.system | 24 ++++++++++++++++-------- c_check | 16 +++++++++++++++- common_arm.h | 5 ----- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index c189b0c47..eedd39b73 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,5 +1,4 @@ -#ifeq logical or -ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif -ifeq ($(CORE), ARMV7) -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a -else -CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -endif -else -CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -endif -endif - ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif - ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -march=armv5 -FCOMMON_OPT += -marm -march=armv5 +CCOMMON_OPT += -march=armv5 +FCOMMON_OPT += -march=armv5 endif diff --git a/Makefile.system b/Makefile.system index 29d3efd53..2cae5f1c9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -242,6 +242,10 @@ EXTRALIB += -lm NO_EXPRECISION = 1 endif +ifeq ($(OSNAME), Android) +EXTRALIB += -lm +endif + ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif @@ -483,16 +487,20 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -CCOMMON_OPT += -marm -FCOMMON_OPT += -marm - -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI -FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI +# If ABI is specified on command line use it. Else use the automatically detected ABI. 
+ifeq ($(ARM_SOFTFP_ABI),1) +ARM_ABI = softfp else -CCOMMON_OPT += -mfloat-abi=hard -FCOMMON_OPT += -mfloat-abi=hard +ifeq ($(ARM_HARD_ABI),1) +ARM_ABI = hard +else +ARM_ABI=$(ARM_ABI_AUTO) endif +endif +export ARM_ABI_AUTO +CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) +FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) + endif ifeq ($(ARCH), arm64) diff --git a/c_check b/c_check index 20da288be..2e7e08cfb 100644 --- a/c_check +++ b/c_check @@ -94,7 +94,17 @@ if ($architecture eq "mips64") { $defined = 1; } -if (($architecture eq "arm") || ($architecture eq "arm64")) { +if ($architecture eq "arm") { + $defined = 1; + $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; + if ($data ne "") { + $abi = "hard"; + } else { + $abi = "softfp"; + } +} + +if ($architecture eq "arm64") { $defined = 1; } @@ -287,6 +297,10 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +if ($architecture eq "arm") { + print MAKEFILE "ARM_ABI_AUTO=$abi\n"; +} + $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/common_arm.h b/common_arm.h index a17acb448..27fa76b76 100644 --- a/common_arm.h +++ b/common_arm.h @@ -111,11 +111,6 @@ REALNAME: #define PROFCODE -#ifdef __ARM_PCS -//-mfloat-abi=softfp -#define SOFT_FLOAT_ABI -#endif - #endif From da7f0ff425a4ccd9a4f32b7fff33b9ef807ad0f4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:46:18 +0530 Subject: [PATCH 12/35] generic: add some generic gemm and trmm kernels Added generic 4x4 and 4x2 gemm kernels Added generic 4x2 trmm kernel --- kernel/generic/gemmkernel_4x2.c | 317 ++++++++++++++++++ kernel/generic/gemmkernel_4x4.c | 571 ++++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x2.c | 528 +++++++++++++++++++++++++++++ 3 files changed, 1416 insertions(+) create mode 100644 kernel/generic/gemmkernel_4x2.c create mode 100644 kernel/generic/gemmkernel_4x4.c create mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c new file mode 100644 index 000000000..1d15de1d7 --- /dev/null +++ b/kernel/generic/gemmkernel_4x2.c @@ -0,0 +1,317 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + for (j=0; j<(bn/2); j+=2) + { + C0 = C; + C1 = C0+ldc; + + ptrba = ba; + + for (i=0; i + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + + for (j=0; j + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + for (j=0; j<(bn/2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + + ptrba = ba; + + for (i=0; i Date: Fri, 30 Jun 2017 13:06:38 +0530 Subject: [PATCH 13/35] arm: Use assembly implementations based on the ARM abi In case of softfp abi, assembly implementations of only those APIs are used which doesnt have a floating point argument or return value. In case of hard abi, all assembly implementations are used. 
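The split exists because the hand-written VFP kernels follow the hard-float calling convention: a routine whose signature carries floats leaves its result in s0/d0 and expects float arguments in VFP registers, while a softfp caller passes and receives those values in core registers (r0, or r0/r1 for a double). Until the per-kernel shims in the following patches add explicit vmov/vldr moves at the boundaries, only the assembly kernels with no floating-point arguments or return values are safe under softfp; the rest fall back to the generic C kernels. A small sketch of what the return-value shim amounts to, assuming a 32-bit ARM target (the two helpers below are illustrative, not OpenBLAS code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* What "vmov r0, s0" does at the end of a single-precision kernel: the float
 * result is handed back as its raw bit pattern in a core register, which is
 * where a softfp caller expects to find it. */
static uint32_t result_in_core_reg(float s0)
{
    uint32_t r0;
    memcpy(&r0, &s0, sizeof r0);   /* pure bit copy, no conversion */
    return r0;
}

static float result_seen_by_caller(uint32_t r0)
{
    float v;
    memcpy(&v, &r0, sizeof v);
    return v;
}

int main(void)
{
    printf("%f\n", result_seen_by_caller(result_in_core_reg(3.5f)));  /* 3.500000 */
    return 0;
}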
--- kernel/arm/KERNEL.ARMV6 | 112 +++++++++++++++-------------------- kernel/arm/KERNEL.ARMV7 | 125 +++++----------------------------------- 2 files changed, 60 insertions(+), 177 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 16bde105b..a2dd4806d 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,7 +1,5 @@ +include $(KERNELDIR)/KERNEL.ARMV5 - - -############################################################################### SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S @@ -34,6 +32,45 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +STRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = ../generic/trmmkernel_4x2.c + +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(ARM_ABI),hard) + SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S @@ -44,11 +81,6 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S @@ -64,11 +96,6 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S @@ -84,63 +111,14 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = sgemm_ncopy_4_vfp.S -SGEMMITCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = sgemm_ncopy_2_vfp.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = dgemm_ncopy_4_vfp.S -DGEMMITCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = dgemm_ncopy_2_vfp.S -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S 
-CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - +endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d5cd94fbd..d4829faa3 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,86 +1,29 @@ +include $(KERNELDIR)/KERNEL.ARMV6 -################################################################################# -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S - -ISAMAXKERNEL = iamax_vfp.S -IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S - -ISAMINKERNEL = iamax_vfp.S -IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S - -ISMAXKERNEL = iamax_vfp.S -IDMAXKERNEL = iamax_vfp.S - -ISMINKERNEL = iamax_vfp.S -IDMINKERNEL = iamax_vfp.S - -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S +ifeq ($(ARM_ABI),hard) SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S @@ 
-88,47 +31,9 @@ CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMONCOPY = sgemm_ncopy_4_vfp.S -SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMONCOPY = dgemm_ncopy_4_vfp.S -DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - +endif From aa5edebc80200a590362dc2229e2751d399c04aa Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 13:12:05 +0530 Subject: [PATCH 14/35] arm: add softfp support in kernel/arm/asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/asum_vfp.S | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a2dd4806d..1da6e4e1f 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -32,6 +32,11 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -71,11 +76,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index fe6242a5b..5b08e5028 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -475,6 +475,14 @@ asum_kernel_L999: vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + bx lr EPILOGUE From 4f0773f07d07b9adad103e66d7b3abae108d9d31 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 20:06:29 +0530 Subject: [PATCH 15/35] arm: add softfp support in kernel/arm/axpy_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++--- kernel/arm/axpy_vfp.S | 71 ++++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1da6e4e1f..63867d2b7 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,11 @@ DASUMKERNEL = asum_vfp.S 
CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -76,11 +81,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..a407b04bd 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_INC_X [fp, #0 ] -#define OLD_Y [fp, #4 ] -#define OLD_INC_Y [fp, #8 ] -#else +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] +#else +#define OLD_ALPHA [fp, #0] +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define OLD_Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] #endif - + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_ALPHAR [fp, #0] +#define OLD_ALPHAI [fp, #8] +#define OLD_X [fp, #16 ] +#define OLD_INC_X [fp, #20 ] +#define OLD_Y [fp, #24 ] +#define OLD_INC_Y [fp, #28 ] +#endif + +#endif //!defined(COMPLEX) + +#else //__ARM_PCS_VFP + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + +#endif //!defined(__ARM_PCS_VFP) + #define N r0 #define Y r1 #define INC_X r2 @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack -#ifdef ARM_SOFTFP_ABI -#ifndef DOUBLE - vmov s0, r3 //move alpha to s0 +#if !defined(__ARM_PCS_VFP) +#if !defined(COMPLEX) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA + ldr X, OLD_X +#else + vldr d0, OLD_ALPHA ldr X, OLD_X #endif +#else //COMPLEX +#if !defined(DOUBLE) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr X, OLD_X +#else + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr X, OLD_X #endif - +#endif +#endif + ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From 0150fabdb6250748bc45d18ccbb782331526c5cd Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 21:52:32 +0530 Subject: [PATCH 16/35] arm: add softfp support in kernel/arm/rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/rot_vfp.S | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 63867d2b7..e9fe6bedd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -42,6 +42,11 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..6e679ecf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_INC_Y [fp, #0 ] +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) +#define OLD_C [fp, #4] +#define OLD_S [fp, #8] +#else +#define OLD_C [fp, #8] +#define OLD_S [fp, #16] +#endif +#endif #define N r0 #define X r1 @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 From 54915ce343dba7e00dba21c73b9fd35bcded0de3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 23:46:02 +0530 Subject: [PATCH 17/35] arm: add softfp support in kernel/arm/*dot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/cdot_vfp.S | 32 ++++++++++++++++++++++++++------ kernel/arm/ddot_vfp.S | 3 +++ kernel/arm/sdot_vfp.S | 13 ++++++------- kernel/arm/zdot_vfp.S | 32 ++++++++++++++++++++++++++------ 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e9fe6bedd..022547c9b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -47,6 +47,11 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -86,11 +91,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S - SNRM2KERNEL = nrm2_vfp.S DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 0497b6d83..e5a6e4d35 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
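The cdot changes above additionally deal with a complex return value. With the VFP ABI a two-float result comes back in s0/s1, but under the base ABI an 8-byte aggregate is returned through memory: the caller passes a hidden result address in r0 and the visible arguments shift along (N to r1, X to r2, INC_X to r3, Y and INC_Y to the stack), which is what the new OLD_RETURN_ADDR/OLD_N/... macros and the final "vstm RETURN_ADDR, {s0 - s1}" implement. A rough C equivalent, assuming the result is represented as a plain two-float struct (the struct name and prototype below are illustrative, not the project's actual typedefs):

#include <stdio.h>

typedef struct { float real, imag; } complex_float;   /* hypothetical */

/* Returning this 8-byte struct:
 *  - hard-float ABI: a homogeneous FP aggregate, returned in s0/s1;
 *  - base ABI:       returned in memory at an address the caller passes
 *                    in r0, shifting the visible arguments by one slot. */
static complex_float cdot_ref(long n, const float *x, long inc_x,
                              const float *y, long inc_y)
{
    complex_float r = { 0.0f, 0.0f };
    for (long i = 0; i < n; i++) {      /* unconjugated (dotu-style) product */
        float xr = x[2 * i * inc_x], xi = x[2 * i * inc_x + 1];
        float yr = y[2 * i * inc_y], yi = y[2 * i * inc_y + 1];
        r.real += xr * yr - xi * yi;
        r.imag += xr * yi + xi * yr;
    }
    return r;
}

int main(void)
{
    const float x[4] = { 1, 1, 2, 2 }, y[4] = { 3, 0, 0, 4 };
    complex_float d = cdot_ref(2, x, 1, y, 1);
    printf("(%f, %f)\n", (double)d.real, (double)d.imag);  /* (-5, 11) */
    return 0;
}
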
vmov s2, s0 vmov s3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y +#endif cmp N, #0 ble cdot_kernel_L999 @@ -265,7 +283,6 @@ cdot_kernel_S10: cdot_kernel_L999: - sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers @@ -276,8 +293,11 @@ cdot_kernel_L999: vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {s0 - s1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index f28acbae3..fb294d8b4 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -246,6 +246,9 @@ ddot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value +#if !defined(__ARM_PCS_VFP) + vmov r0, r1, d0 +#endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197..5f4f424bf 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60..43f2c0c0b 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr From e25f4c01d60b76ab97252e475abfb8fa7e65c0f9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 19:57:28 +0530 Subject: [PATCH 18/35] arm: add softfp support in kernel/arm/nrm2_vfp*.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/KERNEL.ARMV7 | 10 +++++----- kernel/arm/nrm2_vfp.S | 7 +++++++ kernel/arm/nrm2_vfpv3.S | 9 +++++++-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022547c9b..be51e83b8 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -52,6 +52,11 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfp.S -DNRM2KERNEL = nrm2_vfp.S -CNRM2KERNEL = nrm2_vfp.S -ZNRM2KERNEL = nrm2_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d4829faa3..f4823b70a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,5 +1,10 @@ include $(KERNELDIR)/KERNEL.ARMV6 +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -17,11 +22,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfpv3.S -DNRM2KERNEL = nrm2_vfpv3.S -CNRM2KERNEL = nrm2_vfpv3.S -ZNRM2KERNEL = nrm2_vfpv3.S - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index b3bd28152..16ac5a632 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -573,6 +573,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 +#endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif #endif bx lr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7af966895..84977901d 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -503,8 +503,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 -#ifdef ARM_SOFTFP_ABI - vmov r0, s0 +#endif + +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 #endif #endif From 83bd547517e0a7df5b60ae1e6165c9d3528a07e4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 20:37:40 +0530 Subject: [PATCH 19/35] arm: add softfp support in kernel/arm/swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/swap_vfp.S | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index be51e83b8..86d3dabaa 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ 
-57,6 +57,11 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -96,11 +101,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - SGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 352875188..76661da79 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_X [fp, #0 ] +#define OLD_INC_X [fp, #4 ] +#define OLD_Y [fp, #8 ] +#define OLD_INC_Y [fp, #12 ] +#else +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12] +#define OLD_Y [fp, #16] +#define OLD_INC_Y [fp, #20] +#endif + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_X [fp, #16] +#define OLD_INC_X [fp, #20] +#define OLD_Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#endif // !defined(__ARM_PCS_VFP) + +#else #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] +#endif #define N r0 @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. push {r4 , fp} add fp, sp, #8 +#if !defined(__ARM_PCS_VFP) + ldr X, OLD_X +#endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From ebf9e9dabe0c4f8208e8dd5c8a6579fa0045450e Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 11:16:12 -0700 Subject: [PATCH 20/35] arm64: Change mtune/mcpu options for THUNDERX2T99 target --- Makefile.arm64 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 7e9df2f4b..d19e796a5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx endif ifeq ($(CORE), THUNDERX2T99) -CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan -FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan +CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 +FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 endif From 8f83d3f961f57fb002d8c5359c32a8db50dcab5d Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 00:38:44 +0530 Subject: [PATCH 21/35] arm: add softfp support in vfp gemv kernels --- kernel/arm/KERNEL.ARMV6 | 20 +++++++------- kernel/arm/KERNEL.ARMV7 | 6 ++-- kernel/arm/cgemv_n_vfp.S | 28 +++++++++++++++---- kernel/arm/cgemv_t_vfp.S | 28 +++++++++++++++---- kernel/arm/gemv_n_vfp.S | 44 +++++++++++++++++++++++++---- kernel/arm/gemv_n_vfpv3.S | 58 +++++++++++++++++++++++---------------- kernel/arm/gemv_t_vfp.S | 54 ++++++++++++++++++++++-------------- kernel/arm/gemv_t_vfpv3.S | 44 +++++++++++++++++++++++++---- kernel/arm/zgemv_n_vfp.S | 28 +++++++++++++++---- kernel/arm/zgemv_t_vfp.S | 28 +++++++++++++++---- 10 files changed, 252 insertions(+), 86 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 86d3dabaa..022a93183 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -62,6 +62,16 @@ DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = 
swap_vfp.S +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -101,16 +111,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S - STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index f4823b70a..0872cb8cd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -5,6 +5,9 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -22,9 +25,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S - STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..4a1cd2d45 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..e1c750c85 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0 ble cgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 385370b7f..7c154d741 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_M r0 @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 93bf23e49..54f958b7b 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_M r0 #define AO1 r0 @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M - - + ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 816be54ff..9559d1829 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard abi -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_N r1 #define M r0 @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index 7ae5799bc..b1d3dadf1 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_N r1 @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..7d5567849 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..407026166 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0 ble zgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N From eda9e8632ab7d94d609006612a4b760214dfa847 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:00:48 +0530 Subject: [PATCH 22/35] generic: Bug fixes in generic 4x2 and 4x4 gemm kernels --- kernel/generic/gemmkernel_4x2.c | 30 +++++----- kernel/generic/gemmkernel_4x4.c | 98 ++++++++++++++++----------------- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c index 1d15de1d7..8c784e2f1 100644 --- a/kernel/generic/gemmkernel_4x2.c +++ b/kernel/generic/gemmkernel_4x2.c @@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; - C0 = C0+1; - C1 = C1+1; + C0 += C0+1; + C1 += C1+1; } @@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL C0[0] = res0_0; - C0 = C0+1; + C0 += C0+1; } k = (bk<<0); diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c index bd67b3fc8..99bd9c1ef 100644 --- a/kernel/generic/gemmkernel_4x4.c +++ b/kernel/generic/gemmkernel_4x4.c @@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_2 *= alpha; res3_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; - C2[0] = res2_0; - C2[1] = res2_1; - C2[2] = res2_2; - C2[3] = res2_3; + C2[0] += res2_0; + C2[1] += res2_1; + C2[2] += res2_2; + C2[3] += res2_3; - C3[0] = res3_0; - C3[1] = res3_1; - C3[2] = res3_2; - C3[3] = res3_3; + C3[0] += res3_0; + C3[1] += res3_1; + C3[2] += res3_2; + C3[3] += res3_3; C0 = C0+4; C1 = C1+4; @@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_0 *= alpha; res3_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; - C2[0] = res2_0; - C2[1] = res2_1; + C2[0] += res2_0; + C2[1] += res2_1; - C3[0] = res3_0; - C3[1] = res3_1; + C3[0] += res3_0; + C3[1] += res3_1; C0 = C0+2; C1 = C1+2; @@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] 
+= res1_0; - C2[0] = res2_0; + C2[0] += res2_0; - C3[0] = res3_0; + C3[0] += res3_0; C0 = C0+1; C1 = C1+1; @@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_2 *= alpha; res1_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; C0 = C0+4; C1 = C1+4; @@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; C0 = C0+1; C1 = C1+1; @@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; C0 = C0+1; From 872a11a2bfd90225d5ace725b0ec4f59bd9291f3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:05:48 +0530 Subject: [PATCH 23/35] arm: add softfp support in sgemm/strmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 3 +-- kernel/arm/sgemm_kernel_4x2_vfp.S | 12 ++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 29 +++++++++++------------------ kernel/arm/strmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 52 insertions(+), 22 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022a93183..18d9869de 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -73,6 +73,7 @@ CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S SGEMMKERNEL = ../generic/gemmkernel_4x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S @@ -97,6 +98,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = ../generic/trmmkernel_4x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = ../generic/trmmkernel_4x2.c CGEMMONCOPY = cgemm_ncopy_2_vfp.S @@ -111,12 +113,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 0872cb8cd..e2044133d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -11,7 +11,7 @@ DGEMVNKERNEL = 
gemv_n_vfpv3.S STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -30,7 +30,6 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742..1f21e5a1f 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac90..6491d3571 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644ec..635b1dd13 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac85..e24d24eba 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 09bc6ebe5b26aecd405a25dad2fa2934642fc827 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:24:38 +0530 Subject: [PATCH 24/35] arm: add softfp support in dgemm/dtrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 8 ++------ kernel/arm/KERNEL.ARMV7 | 10 +++------- kernel/arm/dgemm_kernel_4x2_vfp.S | 13 ++++++++++++- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 12 ++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 18d9869de..622085b45 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,7 +72,6 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -SGEMMKERNEL = ../generic/gemmkernel_4x2.c SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -85,7 +84,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S @@ -97,9 +96,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = ../generic/trmmkernel_4x2.c STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S @@ -113,11 +111,9 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 
b/kernel/arm/KERNEL.ARMV7 index e2044133d..63c468e66 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,8 +8,8 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_4x4.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -17,7 +17,7 @@ SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o @@ -25,13 +25,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S - CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 183269d1b..001a6050c 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] - +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index b14052e06..1744b54d8 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index c578d2b1e..3d6fbf8e9 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
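The DGEMM stack offsets above follow the same argument-passing rules: with the three BLASLONG dimensions already in r0-r2, a double alpha cannot use the one remaining core register, so under the base ABI it goes onto the stack (as an aligned pair) and A, B, C and ldc follow it, while the single-precision kernels keep alpha in r3 and shift only the pointers. The sketch below mirrors the prototype of the generic C kernels replaced in these patches, purely to illustrate the calling convention and the accumulate-into-C contract; the triple loop is deliberately naive and indexes plain column-major operands, whereas the real kernels walk packed panels of A and B:

#include <stdio.h>

typedef long blaslong;               /* stand-in for BLASLONG */

/* Hard-float ABI: m,n,k -> r0..r2, alpha -> d0, A -> r3,
 *                 B, C, ldc -> stack  (B [fp,#4], C [fp,#8], OLD_LDC [fp,#12]).
 * Base ABI:       m,n,k -> r0..r2, alpha -> stack (aligned pair), then
 *                 A, B, C, ldc        (OLD_ALPHA_SOFTFP [fp,#4],
 *                                      OLD_A_SOFTFP    [fp,#12], ...). */
static int dgemm_ref(blaslong m, blaslong n, blaslong k, double alpha,
                     const double *A, const double *B, double *C, blaslong ldc)
{
    /* C += alpha*A*B; beta scaling happens outside the kernel, so the
     * kernel adds on top of whatever is already in C. */
    for (blaslong j = 0; j < n; j++)
        for (blaslong i = 0; i < m; i++) {
            double acc = 0.0;
            for (blaslong l = 0; l < k; l++)
                acc += A[i + l * m] * B[l + j * k];
            C[i + j * ldc] += alpha * acc;
        }
    return 0;
}

int main(void)
{
    double A[4] = { 1, 2, 3, 4 }, B[4] = { 5, 6, 7, 8 };
    double C[4] = { 1, 1, 1, 1 };
    dgemm_ref(2, 2, 2, 1.0, A, B, C, 2);
    printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);   /* 24 35 32 47 */
    return 0;
}
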
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c7e455f16..c0c6a1677 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 305cd2e8b41f4daccdfa1e6631bce7f7133faf92 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:42:32 +0530 Subject: [PATCH 25/35] arm: add softfp support in cgemm/ctrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 5 +++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/cgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 622085b45..e8fc3df73 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -98,7 +98,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -111,10 +113,8 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 63c468e66..4bfe18d1d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -10,6 +10,7 @@ DGEMVNKERNEL = gemv_n_vfpv3.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -23,12 +24,12 @@ DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S + ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = 
zgemm_kernel_2x2_vfpv3.S endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..512eea387 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..42eb53a55 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..95578b10a 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..18beb4e47 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 97d671eb610de8cd73fa90923bfbed87d1d8ffef Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:54:32 +0530 Subject: [PATCH 26/35] arm: add softfp support in zgemm/ztrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 13 ++++--------- kernel/arm/KERNEL.ARMV7 | 15 +++++---------- kernel/arm/zgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 67 insertions(+), 19 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e8fc3df73..960dae67b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -96,25 +96,20 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ifeq ($(ARM_ABI),hard) - +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S - -endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4bfe18d1d..5e0b4cfb8 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,10 +8,6 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S - SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S @@ -25,11 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S - -ifeq ($(ARM_ABI),hard) - -ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -endif +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..618f09781 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..0fe0c1993 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..78d09a9c7 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..bf72ce605 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 37efb5bc1d9b78e5e612b5aad896981d58a5d18f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 03:06:36 +0530 Subject: [PATCH 27/35] arm: Remove unnecessary files/code Since softfp code has been added to all required vfp kernels, the code for auto detection of abi is no longer required. The option to force softfp ABI on make command line by giving ARM_SOFTFP_ABI=1 is retained. But there is no need to give this option anymore. Also the newly added C versions of 4x4/4x2 gemm/trmm kernels are removed. These are longer required. Moreover these kernels has bugs. --- Makefile.system | 19 +- c_check | 16 +- kernel/generic/gemmkernel_4x2.c | 317 ------------------ kernel/generic/gemmkernel_4x4.c | 571 -------------------------------- kernel/generic/trmmkernel_4x2.c | 528 ----------------------------- 5 files changed, 8 insertions(+), 1443 deletions(-) delete mode 100644 kernel/generic/gemmkernel_4x2.c delete mode 100644 kernel/generic/gemmkernel_4x4.c delete mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/Makefile.system b/Makefile.system index 2cae5f1c9..4face0e51 100644 --- a/Makefile.system +++ b/Makefile.system @@ -487,19 +487,14 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -# If ABI is specified on command line use it. Else use the automatically detected ABI. -ifeq ($(ARM_SOFTFP_ABI),1) -ARM_ABI = softfp -else -ifeq ($(ARM_HARD_ABI),1) -ARM_ABI = hard -else -ARM_ABI=$(ARM_ABI_AUTO) +CCOMMON_OPT += -marm +FCOMMON_OPT += -marm + +# If softfp abi is mentioned on the command line, force it. +ifeq ($(ARM_SOFTFP_ABI), 1) +CCOMMON_OPT += -mfloat-abi=softfp +FCOMMON_OPT += -mfloat-abi=softfp endif -endif -export ARM_ABI_AUTO -CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) -FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) endif diff --git a/c_check b/c_check index 2e7e08cfb..20da288be 100644 --- a/c_check +++ b/c_check @@ -94,17 +94,7 @@ if ($architecture eq "mips64") { $defined = 1; } -if ($architecture eq "arm") { - $defined = 1; - $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; - if ($data ne "") { - $abi = "hard"; - } else { - $abi = "softfp"; - } -} - -if ($architecture eq "arm64") { +if (($architecture eq "arm") || ($architecture eq "arm64")) { $defined = 1; } @@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; -if ($architecture eq "arm") { - print MAKEFILE "ARM_ABI_AUTO=$abi\n"; -} - $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c deleted file mode 100644 index 8c784e2f1..000000000 --- a/kernel/generic/gemmkernel_4x2.c +++ /dev/null @@ -1,317 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - for (j=0; j<(bn/2); j+=2) - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT res2_0; - FLOAT res2_1; - FLOAT res2_2; - FLOAT res2_3; - - FLOAT res3_0; - FLOAT res3_1; - FLOAT res3_2; - FLOAT res3_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - FLOAT b2; - FLOAT b3; - - - for (j=0; j - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - for (j=0; j<(bn/2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - - ptrba = ba; - - for (i=0; i Date: Wed, 5 Jul 2017 17:01:03 +0800 Subject: [PATCH 28/35] Link -lm or -lm_hard for Android ARMv7. 
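On Android/ARMv7, bionic ships the hard-float math routines in a separate libm_hard: softfp and hard-float code pass floating-point arguments in different registers (core r0-r3 versus VFP d0-d7), so linking the mismatched libm still succeeds but silently returns garbage at run time. A minimal standalone check of that behaviour might look like the sketch below; the toolchain name and build lines in the comments are illustrative assumptions, not part of this patch.

/* abi_math_check.c - hypothetical sanity test for the -lm vs -lm_hard choice.
 *
 * Assumed build lines, mirroring the Makefile.system change below:
 *   softfp:     arm-linux-androideabi-gcc -marm -mfloat-abi=softfp abi_math_check.c -lm
 *   hard-float: arm-linux-androideabi-gcc -marm -mfloat-abi=hard   abi_math_check.c -Wl,-lm_hard
 *
 * With -mfloat-abi=hard the double argument reaches sqrt() in d0, while a
 * softfp libm reads it from r0/r1, so an ABI mismatch miscomputes silently
 * instead of failing at link time.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    volatile double x = 2.0;      /* volatile prevents constant folding */
    double r = sqrt(x);           /* resolved from libm or libm_hard at link time */

    printf("sqrt(%.1f) = %.17g\n", x, r);
    return (fabs(r - 1.4142135623730951) < 1e-12) ? 0 : 1;
}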
--- Makefile.system | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index 29d3efd53..bb55dd693 100644 --- a/Makefile.system +++ b/Makefile.system @@ -493,6 +493,14 @@ else CCOMMON_OPT += -mfloat-abi=hard FCOMMON_OPT += -mfloat-abi=hard endif + +ifeq ($(OSNAME), Android) +ifeq ($(ARM_SOFTFP_ABI), 1) +EXTRALIB += -lm +else +EXTRALIB += -Wl,-lm_hard +endif +endif endif ifeq ($(ARCH), arm64) From 49e62c0e7796ccaa773591e739628846e3d8ab06 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jul 2017 17:30:12 +0200 Subject: [PATCH 29/35] fixed syrk_thread.c taken from wernsaar Stride calculation fix copied from https://github.com/wernsaar/OpenBLAS/commit/88900e1 --- driver/level3/syrk_thread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 94274be72..5f40853dc 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)i; - width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; + width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; @@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)(arg -> n - i); - width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; + width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; From f02d535fdebd541b0fc21a58c8b370c744555531 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 7 Jul 2017 12:30:42 +0530 Subject: [PATCH 30/35] arm: Fix clang compilation for ARMv7 clang is not recognizing some pre-UAL VFP mnemonics like fnmacs, fnmacd, fnmuls and fnmuld. Replaced them with equivalent UAL mnemonics which are vmls.f32, vmls.f64, vnmul.f32 and vnmul.f64 respectively. --- kernel/arm/axpy_vfp.S | 8 ++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/cgemv_n_vfp.S | 16 +++---- kernel/arm/cgemv_t_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/rot_vfp.S | 72 ++++++++++++++--------------- kernel/arm/scal_vfp.S | 24 +++++----- kernel/arm/zgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/zgemv_n_vfp.S | 16 +++---- kernel/arm/zgemv_t_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 16 +++---- 15 files changed, 148 insertions(+), 148 deletions(-) diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..4040c7da2 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -71,14 +71,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define FMAC_R1 fmacd -#define FMAC_R2 fnmacd +#define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs -#define FMAC_R2 fnmacs +#define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -90,14 +90,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FMAC_R1 fmacd #define FMAC_R2 fmacd -#define FMAC_I1 fnmacd +#define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 fmacs -#define FMAC_I1 fnmacs +#define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..639b713cd 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -94,42 +94,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..16c00ad73 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs - #define FMAC_R2 fnmacs + #define FMAC_R1 vmls.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs + #define FMAC_R1 vmls.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs - #define FMAC_I2 fnmacs + #define FMAC_I1 vmls.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..a9040e76e 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -78,42 +78,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..56451c5df 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -76,42 +76,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..50798449b 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -98,42 +98,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..ef7e58fa4 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls - #define FMAC_R2 fnmacs + #define FMAC_R1 vnmul.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuls #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls + #define FMAC_I1 vnmul.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmuls - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 fmacs @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls + #define FMAC_R1 vnmul.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls - #define FMAC_I2 fnmacs + #define FMAC_I1 vnmul.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..0d1067cf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -100,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X, { d2 } fstmiad Y, { d3 } @@ -145,7 +145,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -154,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -163,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -186,7 +186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X, { s2 } fstmias Y, { s3 } @@ -226,13 +226,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -241,13 +241,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -259,13 +259,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -274,13 +274,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -294,13 +294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -314,13 +314,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 vstr d2 , [ X, #0 ] vstr d3 , [ Y, #0 ] vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 vstr d2 , [ X, #8 ] vstr d3 , [ Y, #8 ] @@ -343,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -358,13 +358,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -376,13 +376,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -391,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -411,13 +411,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -431,13 +431,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 vstr s2 , [ X, #0 ] vstr s3 , [ Y, #0 ] vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 vstr s2 , [ X, #4 ] vstr s3 , [ Y, #4 ] diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index a8939c3a2..cc3e3b98d 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X, { d2 - d3 } @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X, { s2 - s3 } diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..6aeb6c790 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -87,42 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..10c83e356 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd - #define FMAC_R2 fnmacd + #define FMAC_R1 vmls.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd + #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd - #define FMAC_I2 fnmacd + #define FMAC_I1 vmls.f64 + #define FMAC_I2 vmls.f64 #endif diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..cba59567d 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -79,42 +79,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..b11c08086 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -77,42 +77,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..f412dfcfa 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -96,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..92370bbc1 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld - #define FMAC_R2 fnmacd + #define FMAC_R1 vnmul.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuld #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld + #define FMAC_I1 vnmul.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmuld - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld + #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld - #define FMAC_I2 fnmacd + #define FMAC_I1 vnmul.f64 + #define FMAC_I2 vmls.f64 #endif From c1cf62d2c030b65d8fbf37b19ca1d88fa38f7709 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jul 2017 09:45:38 +0200 Subject: [PATCH 31/35] Add sched_getcpu implementation for pre-2.6 glibc Fixes #1210, compilation on RHEL5 with affinity enabled --- driver/others/init.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 9be6f52b0..3e6176967 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -354,6 +354,24 @@ static int numa_check(void) { return common -> num_nodes; } +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 6) +int sched_getcpu(void) +{ +int cpu; +FILE *fp = NULL; +if ( (fp = fopen("/proc/self/stat", "r")) == NULL) + return -1; +if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) { + fclose (fp); + return -1; + } + fclose (fp); + return(cpu); +} +#endif +#endif + static void numa_mapping(void) { int node, cpu, core; @@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { - #ifdef DEBUG fprintf(stderr, "Shared Memory Initialization.\n"); #endif @@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); common -> final_num_procs = 0; - for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. 
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; From ad2462811a4093153ea9898200ab73ef4aea6f23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jul 2017 13:15:24 +0200 Subject: [PATCH 32/35] Do not add -lpthread on Android builds (#1229) * Do not add -lpthread on Android builds * Do not add -lpthread on Android cmake builds --- cmake/os.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index f5a75027c..e9df68d7f 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -77,7 +77,7 @@ if (CYGWIN) set(NO_EXPRECISION 1) endif () -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") if (SMP) set(EXTRALIB "${EXTRALIB} -lpthread") endif () From 4a012c3d208f7e2a1df9303a50c884970217a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Jul 2017 15:39:15 +0200 Subject: [PATCH 33/35] Fix unintentional fall-through cases in get_cacheinfo These appear to be unintended side effects of PR #1091, probably causing #1232 --- cpuid_x86.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index ab2ecdcaf..3733ea3ac 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -637,12 +637,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ LD1.linesize = 64; break; case 0x63 : - DTB.size = 2048; - DTB.associative = 4; - DTB.linesize = 32; - LDTB.size = 4096; - LDTB.associative= 4; - LDTB.linesize = 32; + DTB.size = 2048; + DTB.associative = 4; + DTB.linesize = 32; + LDTB.size = 4096; + LDTB.associative= 4; + LDTB.linesize = 32; + break; case 0x66 : LD1.size = 8; LD1.associative = 4; @@ -675,12 +676,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ LC1.associative = 8; break; case 0x76 : - ITB.size = 2048; - ITB.associative = 0; - ITB.linesize = 8; - LITB.size = 4096; - LITB.associative= 0; - LITB.linesize = 8; + ITB.size = 2048; + ITB.associative = 0; + ITB.linesize = 8; + LITB.size = 4096; + LITB.associative= 0; + LITB.linesize = 8; + break; case 0x77 : LC1.size = 16; LC1.associative = 4; From 529bfc36ec444223ba7b49717ef2d7fa12445159 Mon Sep 17 00:00:00 2001 From: Andrew Date: Wed, 12 Jul 2017 00:59:30 +0200 Subject: [PATCH 34/35] Fix write past fixed size buffer --- driver/level2/gbmv_thread.c | 2 +- driver/level2/sbmv_thread.c | 2 +- driver/level2/spmv_thread.c | 2 +- driver/level2/tbmv_thread.c | 2 +- driver/level2/tpmv_thread.c | 2 +- driver/level2/trmv_thread.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index ef9d58d76..e86b565f8 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; - BLASLONG range_m[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index a0377d638..5718c0ec9 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x #endif blas_arg_t args; - blas_queue_t queue[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER + 1]; 
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index f8ae3cdcd..035300841 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index bbb1c50eb..226a922e9 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index 47dc1daf9..c91b52775 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 42edb83cb..0a155366c 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; From d33fc32cf30cf1262030c93fa44c72ca8ab27681 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Jul 2017 09:35:11 +0200 Subject: [PATCH 35/35] Revert "Fix unintentional fall-through cases in get_cacheinfo" --- cpuid_x86.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 3733ea3ac..ab2ecdcaf 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -637,13 +637,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ LD1.linesize = 64; break; case 0x63 : - DTB.size = 2048; - DTB.associative = 4; - DTB.linesize = 32; - LDTB.size = 4096; - LDTB.associative= 4; - LDTB.linesize = 32; - break; + DTB.size = 2048; + DTB.associative = 4; + DTB.linesize = 32; + LDTB.size = 4096; + LDTB.associative= 4; + LDTB.linesize = 32; case 0x66 : LD1.size = 8; LD1.associative = 4; @@ -676,13 +675,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ LC1.associative = 8; break; case 0x76 : - ITB.size = 2048; - ITB.associative = 0; - ITB.linesize = 8; - LITB.size = 4096; - LITB.associative= 0; - LITB.linesize = 8; - break; + ITB.size = 2048; + ITB.associative = 0; + ITB.linesize = 8; + LITB.size = 4096; + LITB.associative= 0; + LITB.linesize = 8; case 0x77 : LC1.size = 16; LC1.associative = 4;
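The hunks above undo PATCH 33 and restore the original fall-through in the 0x63 and 0x76 descriptor cases: without a break, the 0x63 branch continues into the 0x66 assignments (and 0x76 into 0x77). A reduced, hypothetical illustration of that switch behaviour — not code from the OpenBLAS sources, with made-up struct fields and values — follows.

/* fallthrough_demo.c - reduced model of the descriptor switch touched by
 * PATCH 33 and this revert; the struct and values are invented for clarity.
 */
#include <stdio.h>

struct cache { int dtb_size; int ld1_size; };

static struct cache decode(int descriptor)
{
    struct cache c = {0, 0};

    switch (descriptor) {
    case 0x63:
        c.dtb_size = 2048;
        /* no break: execution falls through into case 0x66, as in the
         * reverted-to code, so descriptor 0x63 also fills the LD1 field */
    case 0x66:
        c.ld1_size = 8;
        break;
    }
    return c;
}

int main(void)
{
    struct cache c = decode(0x63);
    printf("dtb=%d ld1=%d\n", c.dtb_size, c.ld1_size);   /* prints dtb=2048 ld1=8 */
    return 0;
}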