diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 65955f33d..10cc573db 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -101,6 +101,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + while(i < n) { diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index c17741cc3..0f77d5fbc 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -101,15 +101,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + while(i < n) { - dot += y[iy] * x[ix] ; + temp1 += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } + dot = temp1 + temp2; return(dot); } diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 0d2a2923c..b37e24d9b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -76,9 +76,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { #if defined(SANDYBRIDGE) - int n1 = n & -64; + BLASLONG n1 = n & -64; #else - int n1 = n & -32; + BLASLONG n1 = n & -32; #endif if ( n1 ) @@ -97,6 +97,29 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + + } + + while(i < n) { diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c14659013..a6da1fea7 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -80,7 +80,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -32; + BLASLONG n1 = n & -32; if ( n1 ) sdot_kernel_16(n1, x, y , &dot ); @@ -99,6 +99,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + while(i < n) {