optimized sgemv_n_4.c
This commit is contained in:
parent
2a60c6d4b0
commit
3a5d8dbff9
|
@ -174,9 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
BLASLONG n1;
|
BLASLONG n1;
|
||||||
BLASLONG m1;
|
BLASLONG m1;
|
||||||
BLASLONG m2;
|
BLASLONG m2;
|
||||||
|
BLASLONG m3;
|
||||||
BLASLONG n2;
|
BLASLONG n2;
|
||||||
BLASLONG lda4 = 4 * lda;
|
BLASLONG lda4 = lda << 2;
|
||||||
BLASLONG lda8 = 8 * lda;
|
BLASLONG lda8 = lda << 3;
|
||||||
FLOAT xbuffer[8],*ybuffer;
|
FLOAT xbuffer[8],*ybuffer;
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if ( m < 1 ) return(0);
|
||||||
|
@ -186,18 +187,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if ( inc_x == 1 )
|
||||||
{
|
{
|
||||||
n1 = n / 8 ;
|
n1 = n >> 3 ;
|
||||||
n2 = n % 8 ;
|
n2 = n & 7 ;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
n1 = n / 4 ;
|
n1 = n >> 2 ;
|
||||||
n2 = n % 4 ;
|
n2 = n & 3 ;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m1 = m - ( m % 4 );
|
m3 = m & 3 ;
|
||||||
m2 = (m % NBMAX) - (m % 4) ;
|
m1 = m & -4 ;
|
||||||
|
m2 = (m & (NBMAX-1)) - m3 ;
|
||||||
|
|
||||||
|
|
||||||
y_ptr = y;
|
y_ptr = y;
|
||||||
|
|
||||||
|
@ -237,8 +240,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr += 8;
|
x_ptr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
for( i = 0; i < n1 ; i++)
|
if ( n2 & 4 )
|
||||||
{
|
{
|
||||||
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
|
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
|
@ -248,8 +251,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
a_ptr += lda4;
|
a_ptr += lda4;
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for( i = 0; i < ( n2 & 3 ) ; i++)
|
||||||
{
|
{
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
@ -296,8 +299,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
a += NB;
|
a += NB;
|
||||||
y_ptr += NB * inc_y;
|
y_ptr += NB * inc_y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( m3 == 0 ) return;
|
||||||
|
|
||||||
j=0;
|
j=0;
|
||||||
while ( j < (m % 4))
|
while ( j < m3 )
|
||||||
{
|
{
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
|
|
|
@ -58,13 +58,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
".L01LOOP%=: \n\t"
|
||||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
|
||||||
"xorps %%xmm5 , %%xmm5 \n\t"
|
"xorps %%xmm5 , %%xmm5 \n\t"
|
||||||
|
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||||
|
|
||||||
|
".align 2 \n\t"
|
||||||
"movups (%4,%0,4), %%xmm8 \n\t"
|
"movups (%4,%0,4), %%xmm8 \n\t"
|
||||||
"movups (%5,%0,4), %%xmm9 \n\t"
|
"movups (%5,%0,4), %%xmm9 \n\t"
|
||||||
"movups (%6,%0,4), %%xmm10 \n\t"
|
"movups (%6,%0,4), %%xmm10 \n\t"
|
||||||
"movups (%7,%0,4), %%xmm11 \n\t"
|
"movups (%7,%0,4), %%xmm11 \n\t"
|
||||||
|
".align 2 \n\t"
|
||||||
"mulps %%xmm12, %%xmm8 \n\t"
|
"mulps %%xmm12, %%xmm8 \n\t"
|
||||||
"mulps %%xmm13, %%xmm9 \n\t"
|
"mulps %%xmm13, %%xmm9 \n\t"
|
||||||
"mulps %%xmm14, %%xmm10 \n\t"
|
"mulps %%xmm14, %%xmm10 \n\t"
|
||||||
|
@ -78,6 +80,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"movups (%5,%8,4), %%xmm9 \n\t"
|
"movups (%5,%8,4), %%xmm9 \n\t"
|
||||||
"movups (%6,%8,4), %%xmm10 \n\t"
|
"movups (%6,%8,4), %%xmm10 \n\t"
|
||||||
"movups (%7,%8,4), %%xmm11 \n\t"
|
"movups (%7,%8,4), %%xmm11 \n\t"
|
||||||
|
".align 2 \n\t"
|
||||||
"mulps %%xmm0 , %%xmm8 \n\t"
|
"mulps %%xmm0 , %%xmm8 \n\t"
|
||||||
"mulps %%xmm1 , %%xmm9 \n\t"
|
"mulps %%xmm1 , %%xmm9 \n\t"
|
||||||
"mulps %%xmm2 , %%xmm10 \n\t"
|
"mulps %%xmm2 , %%xmm10 \n\t"
|
||||||
|
|
Loading…
Reference in New Issue