Optimize srot/drot functions for POWER10

This patch uses the new POWER10 vector-pair instructions for
loads and stores.
This commit is contained in:
Rajalakshmi Srinivasaraghavan
2021-01-21 13:24:45 -06:00
parent d6cf67778c
commit 439b93f6d2
4 changed files with 341 additions and 2 deletions

View File

@@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10)
#include "drot_microk_power10.c"
#endif
#endif
@@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
temp = c*x[i] + s*y[i] ;
y[i] = c*y[i] - s*x[i] ;
x[i] = temp ;
}
}
BLASLONG n1 = (n-i) & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1,&x[i], &y[i], c, s);
i+=n1;
}
#else
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
#endif
while(i < n)
{