diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 346315aba..0fd7ac35f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f1573dd30..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,8 +185,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - n1 = n >> 3 ; - n2 = n & 7 ; + if ( inc_x == 1 ) + { + n1 = n >> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } m3 = m & 3 ; m1 = m & -4 ; @@ -258,32 +267,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[5] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[7] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - } - - if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -301,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < ( n2 & 3) ; i++) + for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x;