In DSDOT mode, split the microkernel workload into chunks of 32 floats and accumulate the per-chunk partial sums in double precision to limit loss of precision

This commit is contained in:
Martin Kroeker 2017-10-22 18:18:51 +02:00 committed by GitHub
parent 28c3fa8950
commit 5e3e91d0fc
1 changed file with 22 additions and 6 deletions

View File

@ -78,7 +78,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
double dot = 0.0 ; double dot = 0.0 ;
#if defined (DSDOT)
double mydot = 0.0;
FLOAT asmdot = 0.0;
#else
FLOAT mydot=0.0; FLOAT mydot=0.0;
#endif
BLASLONG n1; BLASLONG n1;
if ( n <= 0 ) return(dot); if ( n <= 0 ) return(dot);
@ -89,9 +94,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
n1 = n & (BLASLONG)(-32); n1 = n & (BLASLONG)(-32);
if ( n1 ) if ( n1 )
#if defined(DSDOT)
{
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG n2 = 32;
while (i<n1) {
sdot_kernel_16(n2, x1, y1 , &asmdot );
mydot += (double)asmdot;
asmdot=0.;
x1+=32;
y1+=32;
i+=32;
}
}
#else
sdot_kernel_16(n1, x, y , &mydot ); sdot_kernel_16(n1, x, y , &mydot );
#endif
i = n1; i = n1;
while(i < n) while(i < n)
{ {
@ -103,11 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
i++ ; i++ ;
} }
#if defined(DSDOT)
dot+=(double)mydot;
#else
dot+=mydot; dot+=mydot;
#endif
return(dot); return(dot);