small optimizations for zgemv kernels

This commit is contained in:
wernsaar 2013-11-23 12:35:31 +01:00
parent 9a0f978929
commit 33d3ab6e09
2 changed files with 101 additions and 60 deletions

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
/************************************************************************************** /**************************************************************************************
* * 2013/09/15 Saar * * 2013/11/23 Saar
* * BLASTEST float : OK * * BLASTEST float : OK
* * BLASTEST double : OK * * BLASTEST double : OK
* CTEST : OK * CTEST : OK
@ -48,20 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG lda2; BLASLONG lda2;
BLASLONG i2; BLASLONG i2;
if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0);
lda2 = 2*lda; lda2 = 2*lda;
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
ix = 0; ix = 0;
a_ptr = a; a_ptr = a;
#if !defined(CONJ) if ( inc_x == 1 && inc_y == 1 )
for (j=0; j<n; j++)
{ {
for (j=0; j<n; j++)
{
#if !defined(XCONJ) #if !defined(XCONJ)
temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
@ -70,9 +67,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif #endif
iy = 0; iy = 0;
i2=0;
for (i=0; i<m; i++) for (i=0; i<m; i++)
{ {
i2 = 2*i; #if !defined(CONJ)
#if !defined(XCONJ) #if !defined(XCONJ)
y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
@ -81,13 +81,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#endif #endif
iy += inc_y2; #else
#if !defined(XCONJ)
y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#else
y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#endif
#endif
i2 += 2;
iy += 2;
} }
a_ptr += lda2; a_ptr += lda2;
ix += inc_x2; ix += 2;
}
return(0);
} }
#else
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
for (j=0; j<n; j++) for (j=0; j<n; j++)
{ {
@ -99,9 +118,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif #endif
iy = 0; iy = 0;
i2=0;
for (i=0; i<m; i++) for (i=0; i<m; i++)
{ {
i2 = 2*i; #if !defined(CONJ)
#if !defined(XCONJ)
y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#else
y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#endif
#else
#if !defined(XCONJ) #if !defined(XCONJ)
y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
@ -110,6 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#endif #endif
#endif
i2 += 2;
iy += inc_y2; iy += inc_y2;
} }
a_ptr += lda2; a_ptr += lda2;
@ -117,8 +151,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
} }
#endif
return(0); return(0);
} }

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
/************************************************************************************** /**************************************************************************************
* * 2013/09/15 Saar * * 2013/11/23 Saar
* * BLASTEST float : OK * * BLASTEST float : OK
* * BLASTEST double : OK * * BLASTEST double : OK
* CTEST : OK * CTEST : OK
@ -48,32 +48,75 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG lda2; BLASLONG lda2;
BLASLONG i2; BLASLONG i2;
if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0);
lda2 = 2*lda; lda2 = 2*lda;
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
iy = 0; iy = 0;
a_ptr = a; a_ptr = a;
#if !defined(CONJ) if ( inc_x == 1 && inc_y == 1 )
for (j=0; j<n; j++)
{ {
for (j=0; j<n; j++)
{
temp_r = 0.0; temp_r = 0.0;
temp_i = 0.0; temp_i = 0.0;
ix = 0; ix = 0;
i2=0;
for (i=0; i<m; i++) for (i=0; i<m; i++)
{ {
i2 = 2*i;
#if !defined(XCONJ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[i2] * x[ix] - a_ptr[i2+1] * x[ix+1]; temp_r += a_ptr[i2] * x[ix] - a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix]; temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix];
#else #else
temp_r += a_ptr[i2] * x[ix] + a_ptr[i2+1] * x[ix+1]; temp_r += a_ptr[i2] * x[ix] + a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix]; temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix];
#endif #endif
i2 += 2;
ix += 2;
}
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y[iy] += alpha_r * temp_r + alpha_i * temp_i;
y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda2;
iy += 2;
}
return(0);
}
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
for (j=0; j<n; j++)
{
temp_r = 0.0;
temp_i = 0.0;
ix = 0;
i2=0;
for (i=0; i<m; i++)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[i2] * x[ix] - a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix];
#else
temp_r += a_ptr[i2] * x[ix] + a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix];
#endif
i2 += 2;
ix += inc_x2; ix += inc_x2;
} }
@ -89,40 +132,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
iy += inc_y2; iy += inc_y2;
} }
#else
for (j=0; j<n; j++)
{
temp_r = 0.0;
temp_i = 0.0;
ix = 0;
for (i=0; i<m; i++)
{
i2 = 2*i;
#if !defined(XCONJ)
temp_r += a_ptr[i2] * x[ix] + a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix];
#else
temp_r += a_ptr[i2] * x[ix] - a_ptr[i2+1] * x[ix+1];
temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix];
#endif
ix += inc_x2;
}
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y[iy] += alpha_r * temp_r + alpha_i * temp_i;
y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda2;
iy += inc_y2;
}
#endif
return(0); return(0);
} }