Optimise sscal for POWER9

Use the lxvd2x instruction instead of lxvw4x for vector loads, and the
matching stxvd2x instruction instead of stxvw4x for vector stores.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
This commit is contained in:
Matt Brown 2017-06-14 16:47:56 +10:00
parent edc97918f8
commit bd831a03a8
1 changed file with 40 additions and 40 deletions

View File

@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n\t"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n"
"1: \n\t"
"stxvw4x %x3, 0, %2 \n\t"
"stxvw4x %x3, %4, %2 \n\t"
"stxvw4x %x3, %5, %2 \n\t"
"stxvw4x %x3, %6, %2 \n\t"
"stxvw4x %x3, %7, %2 \n\t"
"stxvw4x %x3, %8, %2 \n\t"
"stxvw4x %x3, %9, %2 \n\t"
"stxvw4x %x3, %10, %2 \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
"stxvd2x %x3, %5, %2 \n\t"
"stxvd2x %x3, %6, %2 \n\t"
"stxvd2x %x3, %7, %2 \n\t"
"stxvd2x %x3, %8, %2 \n\t"
"stxvd2x %x3, %9, %2 \n\t"
"stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"