Add data prefetch in DOT and ASUM functions

Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
This commit is contained in:
kaustubh
2016-11-22 11:21:03 +05:30
parent d4da3fbe9f
commit 00abce3b93
8 changed files with 961 additions and 1237 deletions

View File

@@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "macros_msa.h"
/* Dot product of the float vectors x and y; returns double when DSDOT is defined, FLOAT otherwise. */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
@@ -37,96 +36,71 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i = 0;
double dot = 0.0;
float x0, x1, x2, x3, y0, y1, y2, y3;
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
v4f32 dot0 = {0, 0, 0, 0};
v4f32 dot1 = {0, 0, 0, 0};
v4f32 dot2 = {0, 0, 0, 0};
v4f32 dot3 = {0, 0, 0, 0};
if (n < 0) return (dot);
if (n < 1) return (dot);
if ((1 == inc_x) && (1 == inc_y))
{
for (i = (n >> 5); i--;)
{
LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 256(%[x])\n\t"
"pref 0, 288(%[x])\n\t"
"pref 0, 320(%[x])\n\t"
"pref 0, 352(%[x])\n\t"
"pref 0, 256(%[y])\n\t"
"pref 0, 288(%[y])\n\t"
"pref 0, 320(%[y])\n\t"
"pref 0, 352(%[y])\n\t"
: : [x] "r" (x), [y] "r" (y)
);
#endif
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
dot0 += (vy3 * vx3);
dot1 += (vy1 * vx1);
dot2 += (vy2 * vx2);
dot3 += (vy3 * vx3);
dot0 += (vy4 * vx4);
dot0 += (vy5 * vx5);
dot0 += (vy6 * vx6);
dot0 += (vy7 * vx7);
dot1 += (vy5 * vx5);
dot2 += (vy6 * vx6);
dot3 += (vy7 * vx7);
}
if (n & 31)
{
if ((n & 16) && (n & 8) && (n & 4))
if (n & 16)
{
LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
dot0 += (vy3 * vx3);
dot0 += (vy4 * vx4);
dot0 += (vy5 * vx5);
dot0 += (vy6 * vx6);
dot1 += (vy1 * vx1);
dot2 += (vy2 * vx2);
dot3 += (vy3 * vx3);
}
else if ((n & 16) && (n & 8))
if (n & 8)
{
LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
LD_SP2_INC(x, 4, vx0, vx1);
LD_SP2_INC(y, 4, vy0, vy1);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
dot0 += (vy3 * vx3);
dot0 += (vy4 * vx4);
dot0 += (vy5 * vx5);
dot1 += (vy1 * vx1);
}
else if ((n & 16) && (n & 4))
{
LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
dot0 += (vy3 * vx3);
dot0 += (vy4 * vx4);
}
else if ((n & 8) && (n & 4))
{
LD_SP3_INC(x, 4, vx0, vx1, vx2);
LD_SP3_INC(y, 4, vy0, vy1, vy2);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
}
else if (n & 16)
{
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
dot0 += (vy2 * vx2);
dot0 += (vy3 * vx3);
}
else if (n & 8)
{
LD_SP2_INC(x, 4, vx0, vx1);
LD_SP2_INC(y, 4, vy0, vy1);
dot0 += (vy0 * vx0);
dot0 += (vy1 * vx1);
}
else if (n & 4)
if (n & 4)
{
vx0 = LD_SP(x); x += 4;
vy0 = LD_SP(y); y += 4;
@@ -134,16 +108,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot0 += (vy0 * vx0);
}
if ((n & 2) && (n & 1))
{
LD_GP3_INC(x, 1, x0, x1, x2);
LD_GP3_INC(y, 1, y0, y1, y2);
dot += (y0 * x0);
dot += (y1 * x1);
dot += (y2 * x2);
}
else if (n & 2)
if (n & 2)
{
LD_GP2_INC(x, 1, x0, x1);
LD_GP2_INC(y, 1, y0, y1);
@@ -151,7 +116,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot += (y0 * x0);
dot += (y1 * x1);
}
else if (n & 1)
if (n & 1)
{
x0 = *x;
y0 = *y;
@@ -160,6 +126,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
}
dot0 += dot1 + dot2 + dot3;
dot += dot0[0];
dot += dot0[1];
dot += dot0[2];
@@ -178,16 +146,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot += (y3 * x3);
}
if ((n & 2) && (n & 1))
{
LD_GP3_INC(x, inc_x, x0, x1, x2);
LD_GP3_INC(y, inc_y, y0, y1, y2);
dot += (y0 * x0);
dot += (y1 * x1);
dot += (y2 * x2);
}
else if (n & 2)
if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(y, inc_y, y0, y1);
@@ -195,7 +154,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot += (y0 * x0);
dot += (y1 * x1);
}
else if (n & 1)
if (n & 1)
{
x0 = *x;
y0 = *y;