Optimize the performance of sum by using universal intrinsics
This commit is contained in:
parent
bb74dd29db
commit
0ed1f07660
|
@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* trivial copy of asum.c with the ABS() removed *
|
* trivial copy of asum.c with the ABS() removed *
|
||||||
**************************************************************************************/
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "../simd/intrin.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
/*
 * Plain (non-absolute) sum of the elements of a strided vector.
 *
 *   n      number of elements to add up
 *   x      pointer to the first element
 *   inc_x  stride, in elements, between consecutive entries of x
 *
 * Returns the sum of x[0], x[inc_x], ..., x[(n - 1) * inc_x].
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	FLOAT sumf = 0.0;

	if (n <= 0 || inc_x <= 0)
		return (sumf);

	/* From here on n is the exclusive upper bound for the index into x. */
	n *= inc_x;

	if (inc_x == 1)
	{
		/*
		 * NOTE(review): the vector path below only uses the f32 intrinsics,
		 * so it is skipped when DOUBLE is defined. This assumes DOUBLE is the
		 * macro that turns FLOAT into a double -- confirm against common.h.
		 */
#if V_SIMD && !defined(DOUBLE)
		const int vstep = v_nlanes_f32;
		/*
		 * Round n down to a multiple of 4*vstep and of vstep respectively
		 * (relies on vstep being a power of two). Kept in BLASLONG so large
		 * n is not truncated.
		 */
		const BLASLONG unrollx4 = n & (-vstep * 4);
		const BLASLONG unrollx = n & -vstep;
		v_f32 vsum0 = v_zero_f32();
		v_f32 vsum1 = v_zero_f32();
		v_f32 vsum2 = v_zero_f32();
		v_f32 vsum3 = v_zero_f32();

		/* Four independent accumulators, four vectors per iteration. */
		while (i < unrollx4)
		{
			/* Loads must be offset by i; x itself is never advanced. */
			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
			vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
			vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
			vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
			i += vstep * 4;
		}

		/* Fold the four partial sums into one vector. */
		vsum0 = v_add_f32(
			v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));

		/* Remaining whole vectors, one at a time. */
		while (i < unrollx)
		{
			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
			i += vstep;
		}

		/* Horizontal add of the lanes. */
		sumf = v_sum_f32(vsum0);
#else
		/* Scalar fallback, unrolled by four. */
		const BLASLONG n1 = n & -4;

		for (; i < n1; i += 4)
		{
			sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
		}
#endif
	}

	/*
	 * Strided case, and the leftover tail of the unit-stride case
	 * (i continues from where the loops above stopped).
	 */
	while (i < n)
	{
		sumf += x[i];
		i += inc_x;
	}

	return (sumf);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue