Optimise sasum for POWER9
Use the lxvd2x instruction instead of lxvw4x for the vector loads in the sasum kernel; lxvd2x performs far better than lxvw4x on the new POWER architecture.
This commit is contained in:
parent
19bdf9d52b
commit
32c7fe6bff
|
@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
|
|||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
|
|||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
|
|
Loading…
Reference in New Issue