Merge pull request #553 from wernsaar/develop

optimized some blas level1 kernels for increments != 1
This commit is contained in:
wernsaar 2015-04-24 13:57:48 +02:00
commit da0f27b9ac
4 changed files with 85 additions and 4 deletions

View File

@ -101,6 +101,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
}
while(i < n) while(i < n)
{ {

View File

@ -101,15 +101,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = y[iy] * x[ix] ;
FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
ix += inc_x*4 ;
iy += inc_y*4 ;
temp1 += m1+m3;
temp2 += m2+m4;
i+=4 ;
}
while(i < n) while(i < n)
{ {
dot += y[iy] * x[ix] ; temp1 += y[iy] * x[ix] ;
ix += inc_x ; ix += inc_x ;
iy += inc_y ; iy += inc_y ;
i++ ; i++ ;
} }
dot = temp1 + temp2;
return(dot); return(dot);
} }

View File

@ -76,9 +76,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{ {
#if defined(SANDYBRIDGE) #if defined(SANDYBRIDGE)
int n1 = n & -64; BLASLONG n1 = n & -64;
#else #else
int n1 = n & -32; BLASLONG n1 = n & -32;
#endif #endif
if ( n1 ) if ( n1 )
@ -97,6 +97,29 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
BLASLONG n1 = n & -4;
while(i < n1)
{
FLOAT m1 = da * x[ix] ;
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
y[iy] += m1 ;
y[iy+inc_y] += m2 ;
y[iy+2*inc_y] += m3 ;
y[iy+3*inc_y] += m4 ;
ix += inc_x*4 ;
iy += inc_y*4 ;
i+=4 ;
}
while(i < n) while(i < n)
{ {

View File

@ -80,7 +80,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
int n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 ) if ( n1 )
sdot_kernel_16(n1, x, y , &dot ); sdot_kernel_16(n1, x, y , &dot );
@ -99,6 +99,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
BLASLONG n1 = n & -2;
while(i < n1)
{
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
}
while(i < n) while(i < n)
{ {