diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 943dcdefa..ee762ffce 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -44,12 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_4x8
 
-static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4)
+static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
 {
 	BLASLONG i;
 	FLOAT *a0,*a1,*a2,*a3;
 	FLOAT *b0,*b1,*b2,*b3;
 	FLOAT *x4;
+	FLOAT x[8];
 	a0 = ap[0];
 	a1 = ap[1];
 	a2 = ap[2];
@@ -60,6 +61,9 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON
 	b3 = a3 + lda4 ;
 	x4 = x + 4;
 
+	for ( i=0; i<8; i++)
+		x[i] = xo[i] * *alpha;
+
 	for ( i=0; i< n; i+=4 )
 	{
 
@@ -81,15 +85,19 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON
 
 #ifndef HAVE_KERNEL_4x4
 
-static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
 {
 	BLASLONG i;
 	FLOAT *a0,*a1,*a2,*a3;
+	FLOAT x[4];
 	a0 = ap[0];
 	a1 = ap[1];
 	a2 = ap[2];
 	a3 = ap[3];
 
+	for ( i=0; i<4; i++)
+		x[i] = xo[i] * *alpha;
+
 	for ( i=0; i< n; i+=4 )
 	{
 		y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
@@ -101,32 +109,147 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 
 #endif
 
-static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+#ifndef HAVE_KERNEL_4x2
+
+static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
-	BLASLONG i;
-	FLOAT *a0;
-	a0 = ap;
-	for ( i=0; i< n; i+=4 )
-	{
-		y[i] += a0[i]*x[0];
-		y[i+1] += a0[i+1]*x[0];
-		y[i+2] += a0[i+2]*x[0];
-		y[i+3] += a0[i+3]*x[0];
-	}
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"movss         (%2) , %%xmm12         \n\t"	// x0
+	"movss         (%6) , %%xmm4          \n\t"	// alpha
+	"movss        4(%2) , %%xmm13         \n\t"	// x1
+	"mulss         %%xmm4 , %%xmm12       \n\t"	// alpha
+	"mulss         %%xmm4 , %%xmm13       \n\t"	// alpha
+	"shufps $0,  %%xmm12, %%xmm12         \n\t"
+	"shufps $0,  %%xmm13, %%xmm13         \n\t"
+
+	".align 16                            \n\t"
+	".L01LOOP%=:                          \n\t"
+	"movups        (%3,%0,4), %%xmm4      \n\t"	// 4 * y
+
+	"movups        (%4,%0,4), %%xmm8      \n\t"
+	"movups        (%5,%0,4), %%xmm9      \n\t"
+	"mulps         %%xmm12, %%xmm8        \n\t"
+	"mulps         %%xmm13, %%xmm9        \n\t"
+	"addps         %%xmm8 , %%xmm4        \n\t"
+	"addq          $4     , %0            \n\t"
+	"addps         %%xmm9 , %%xmm4        \n\t"
+
+	"movups  %%xmm4 , -16(%3,%0,4)        \n\t"	// 4 * y
+
+	"subq          $4 , %1                \n\t"
+	"jnz           .L01LOOP%=             \n\t"
+
+	:
+	:
+	  "r" (i),	// 0
+	  "r" (n),	// 1
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap[0]),	// 4
+	  "r" (ap[1]),	// 5
+	  "r" (alpha)	// 6
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	BLASLONG register i = 0;
+	BLASLONG register n1 = n & -8 ;
+	BLASLONG register n2 = n & 4  ;
+
+	__asm__  __volatile__
+	(
+	"movss         (%2), %%xmm12          \n\t"	// x0
+	"mulss         (%6), %%xmm12          \n\t"	// alpha
+	"shufps $0,  %%xmm12, %%xmm12         \n\t"
+
+	"cmpq          $0, %1                 \n\t"
+	"je            .L16END%=              \n\t"
+
+	".align 16                            \n\t"
+	".L01LOOP%=:                          \n\t"
+	"movups        (%3,%0,4), %%xmm4      \n\t"	// 4 * y
+	"movups      16(%3,%0,4), %%xmm5      \n\t"	// 4 * y
+	"movups        (%4,%0,4), %%xmm8      \n\t"	// 4 * a
+	"movups      16(%4,%0,4), %%xmm9      \n\t"	// 4 * a
+	"mulps         %%xmm12, %%xmm8        \n\t"
+	"mulps         %%xmm12, %%xmm9        \n\t"
+	"addps         %%xmm4 , %%xmm8        \n\t"
+	"addps         %%xmm5 , %%xmm9        \n\t"
+
+	"addq          $8 , %0                \n\t"
+	"movups  %%xmm8 , -32(%3,%0,4)        \n\t"	// 4 * y
+	"movups  %%xmm9 , -16(%3,%0,4)        \n\t"	// 4 * y
+
+	"subq          $8 , %1                \n\t"
+
+	"jnz           .L01LOOP%=             \n\t"
+
+	".L16END%=:                           \n\t"
+
+	"testq         $0x04, %5              \n\t"
+	"jz            .L08LABEL%=            \n\t"
+
+	"movups        (%3,%0,4), %%xmm4      \n\t"	// 4 * y
+	"movups        (%4,%0,4), %%xmm8      \n\t"	// 4 * a
+	"mulps         %%xmm12, %%xmm8        \n\t"
+	"addps         %%xmm8 , %%xmm4        \n\t"
+	"movups  %%xmm4 ,    (%3,%0,4)        \n\t"	// 4 * y
+	"addq          $4 , %0                \n\t"
+	"subq          $4 , %1                \n\t"
+
+	".L08LABEL%=:                         \n\t"
+	:
+	:
+	  "r" (i),	// 0
+	  "r" (n1),	// 1
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap),	// 4
+	  "r" (n2),	// 5
+	  "r" (alpha)	// 6
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
 }
 
-
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline));
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha)
+#endif
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 {
 	BLASLONG i;
 	if ( inc_dest != 1 )
 	{
-		FLOAT da = *alpha;
 		for ( i=0; i