Merge pull request #1329 from martin-frbg/dsdot

(Trivial) optimized dsdot implementation for HASWELL
This commit is contained in:
Martin Kroeker 2017-10-25 19:13:38 +02:00 committed by GitHub
commit ab87ee6b48
3 changed files with 42 additions and 5 deletions

View File

@@ -24,6 +24,8 @@ DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSDOTKERNEL = sdot.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c

View File

@@ -68,13 +68,22 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
#endif
#if defined (DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
double dot = 0.0 ;
#if defined (DSDOT)
double mydot = 0.0;
FLOAT asmdot = 0.0;
#else
FLOAT mydot=0.0;
#endif
BLASLONG n1;
if ( n <= 0 ) return(dot);
@@ -85,17 +94,35 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
n1 = n & (BLASLONG)(-32);
if ( n1 )
#if defined(DSDOT)
{
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG n2 = 32;
while (i<n1) {
sdot_kernel_16(n2, x1, y1 , &asmdot );
mydot += (double)asmdot;
asmdot=0.;
x1+=32;
y1+=32;
i+=32;
}
}
#else
sdot_kernel_16(n1, x, y , &mydot );
#endif
i = n1;
while(i < n)
{
#if defined(DSDOT)
dot += (double)y[i] * (double)x[i] ;
#else
dot += y[i] * x[i] ;
#endif
i++ ;
}
dot+=mydot;
return(dot);
@@ -106,8 +133,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n1)
{
#if defined (DSDOT)
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
#else
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
#endif
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
@@ -116,8 +146,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n)
{
#if defined (DSDOT)
dot += (double)y[iy] * (double)x[ix] ;
#else
dot += y[iy] * x[ix] ;
#endif
ix += inc_x ;
iy += inc_y ;
i++ ;

View File

@@ -53,9 +53,11 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y
"vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y
#ifndef DSDOT
"addq $32 , %0 \n\t"
"subq $32 , %1 \n\t"
"jnz 1b \n\t"
#endif
"vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"