Merge pull request #553 from wernsaar/develop

Optimized some BLAS level-1 kernels for non-unit increments (inc_x != 1 or inc_y != 1)
This commit is contained in:
wernsaar 2015-04-24 13:57:48 +02:00
commit da0f27b9ac
4 changed files with 85 additions and 4 deletions

View File

@ -101,6 +101,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
}
while(i < n)
{

View File

@ -101,15 +101,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = y[iy] * x[ix] ;
FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
ix += inc_x*4 ;
iy += inc_y*4 ;
temp1 += m1+m3;
temp2 += m2+m4;
i+=4 ;
}
while(i < n)
{
dot += y[iy] * x[ix] ;
temp1 += y[iy] * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
dot = temp1 + temp2;
return(dot);
}

View File

@ -76,9 +76,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{
#if defined(SANDYBRIDGE)
int n1 = n & -64;
BLASLONG n1 = n & -64;
#else
int n1 = n & -32;
BLASLONG n1 = n & -32;
#endif
if ( n1 )
@ -97,6 +97,29 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
}
while(i < n)
{

View File

@ -80,7 +80,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -32;
BLASLONG n1 = n & -32;
if ( n1 )
sdot_kernel_16(n1, x, y , &dot );
@ -99,6 +99,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
BLASLONG n1 = n & -2;
while(i < n1)
{
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
}
while(i < n)
{