Merge pull request #1329 from martin-frbg/dsdot
(Trivial) optimized dsdot implementation for HASWELL
This commit is contained in:
commit
ab87ee6b48
|
@@ -24,6 +24,8 @@ DDOTKERNEL = ddot.c
|
|||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
|
||||
DSDOTKERNEL = sdot.c
|
||||
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
|
|
|
@@ -68,13 +68,22 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
|||
|
||||
#endif
|
||||
|
||||
#if defined (DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
double dot = 0.0 ;
|
||||
|
||||
#if defined (DSDOT)
|
||||
double mydot = 0.0;
|
||||
FLOAT asmdot = 0.0;
|
||||
#else
|
||||
FLOAT mydot=0.0;
|
||||
#endif
|
||||
BLASLONG n1;
|
||||
|
||||
if ( n <= 0 ) return(dot);
|
||||
|
@@ -85,17 +94,35 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
n1 = n & (BLASLONG)(-32);
|
||||
|
||||
if ( n1 )
|
||||
#if defined(DSDOT)
|
||||
{
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
BLASLONG n2 = 32;
|
||||
while (i<n1) {
|
||||
sdot_kernel_16(n2, x1, y1 , &asmdot );
|
||||
mydot += (double)asmdot;
|
||||
asmdot=0.;
|
||||
x1+=32;
|
||||
y1+=32;
|
||||
i+=32;
|
||||
}
|
||||
}
|
||||
#else
|
||||
sdot_kernel_16(n1, x, y , &mydot );
|
||||
|
||||
|
||||
#endif
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += (double)y[i] * (double)x[i] ;
|
||||
#else
|
||||
dot += y[i] * x[i] ;
|
||||
#endif
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
dot+=mydot;
|
||||
return(dot);
|
||||
|
||||
|
@@ -106,8 +133,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
while(i < n1)
|
||||
{
|
||||
|
||||
#if defined (DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
|
||||
#else
|
||||
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
|
||||
#endif
|
||||
ix += inc_x*2 ;
|
||||
iy += inc_y*2 ;
|
||||
i+=2 ;
|
||||
|
@@ -116,8 +146,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
#if defined (DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] ;
|
||||
#else
|
||||
dot += y[iy] * x[ix] ;
|
||||
#endif
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
|
|
@@ -53,9 +53,11 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y
|
||||
"vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y
|
||||
|
||||
#ifndef DSDOT
|
||||
"addq $32 , %0 \n\t"
|
||||
"subq $32 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
#endif
|
||||
|
||||
"vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"
|
||||
"vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
|
||||
|
|
Loading…
Reference in New Issue