diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c
index 7b78ec61a..d4b3fbc83 100644
--- a/kernel/arm/sum.c
+++ b/kernel/arm/sum.c
@@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * trivial copy of asum.c with the ABS() removed
 *
 **************************************************************************************/
-
 #include "common.h"
+#include "../simd/intrin.h"
 #include <math.h>
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0;
+	BLASLONG i = 0;
 	FLOAT sumf = 0.0;
-	if (n <= 0 || inc_x <= 0) return(sumf);
-
+	if (n <= 0 || inc_x <= 0)
+		return (sumf);
 	n *= inc_x;
-	while(i < n)
+	if (inc_x == 1)
+	{
+#if V_SIMD
+		const int vstep = v_nlanes_f32;
+		const int unrollx4 = n & (-vstep * 4);
+		const int unrollx = n & -vstep;
+		v_f32 vsum0 = v_zero_f32();
+		v_f32 vsum1 = v_zero_f32();
+		v_f32 vsum2 = v_zero_f32();
+		v_f32 vsum3 = v_zero_f32();
+		while (i < unrollx4)
+		{
+			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
+			vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
+			vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
+			vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
+			i += vstep * 4;
+		}
+		vsum0 = v_add_f32(
+			v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));
+		while (i < unrollx)
+		{
+			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
+			i += vstep;
+		}
+		sumf = v_sum_f32(vsum0);
+#else
+		int n1 = n & -4;
+		for (; i < n1; i += 4)
+		{
+			sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
+		}
+#endif
+	}
+	while (i < n)
 	{
 		sumf += x[i];
 		i += inc_x;
 	}
-	return(sumf);
+	return (sumf);
 }
-
-
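
Note on the vector path: the masks n & (-vstep * 4) and n & -vstep round n down to a multiple of four vector widths and of one vector width respectively (valid because the lane count is a power of two), and the four independent accumulators keep the additions out of a single dependency chain. The sketch below is a plain-C model of that structure, not OpenBLAS code: blocked_sum, the fixed vstep = 4, and the scalar inner loops are illustrative stand-ins for v_nlanes_f32 and the v_f32 registers used by the patch.

#include <stdio.h>

/* Plain-C model of the patch's blocked reduction. "vstep" stands in for the
   SIMD lane count (v_nlanes_f32) and s0..s3 for the v_f32 accumulators.
   blocked_sum is a hypothetical helper for illustration, not OpenBLAS code. */
static float blocked_sum(const float *x, long n)
{
    const int vstep = 4;                          /* stand-in lane count (power of two) */
    const long unrollx4 = n & -(long)(vstep * 4); /* largest multiple of 4*vstep <= n */
    const long unrollx = n & -(long)vstep;        /* largest multiple of vstep   <= n */
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    long i = 0;

    /* main loop: four independent partial sums, one per "register" */
    for (; i < unrollx4; i += vstep * 4)
        for (int k = 0; k < vstep; k++)
        {
            s0 += x[i + k];
            s1 += x[i + vstep + k];
            s2 += x[i + vstep * 2 + k];
            s3 += x[i + vstep * 3 + k];
        }
    float sum = (s0 + s1) + (s2 + s3);            /* combine the partial sums */

    /* whole vstep-wide blocks that did not fill a 4x-unrolled iteration */
    for (; i < unrollx; i += vstep)
        for (int k = 0; k < vstep; k++)
            sum += x[i + k];

    /* scalar remainder, matching the kernel's final while loop */
    for (; i < n; i++)
        sum += x[i];
    return sum;
}

int main(void)
{
    float x[37];
    for (int i = 0; i < 37; i++)
        x[i] = 1.0f;
    printf("%.1f\n", blocked_sum(x, 37));         /* prints 37.0 */
    return 0;
}

Because the partial sums are combined in a different order than a strictly sequential loop, the vectorized result can differ from the scalar kernel in the last bits, which is the usual trade-off for reassociated reductions.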