This kernel is only used on Skylake+ if the kernel with AVX512 intrinsics can't be used, but used the variable x1 incorrectly in the tail end of the loop, as it is still at the initial value instead of where x points to. This caused 55 "other error"s in the LAPACK tests (https://github.com/OpenMathLib/OpenBLAS/issues/4282) This change makes casum.c as similar as possible as zasum.c, because zasum.c does this correctly.
145 lines
3.2 KiB
C
145 lines
3.2 KiB
C
#include "common.h"
|
|
|
|
#ifndef ABS_K
|
|
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
|
|
#endif
|
|
|
|
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
|
#include "casum_microk_skylakex-2.c"
|
|
#endif
|
|
|
|
#ifndef HAVE_CASUM_KERNEL
|
|
static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
|
|
{
|
|
|
|
BLASLONG i=0;
|
|
BLASLONG n_8 = n & -8;
|
|
FLOAT *x1 = x;
|
|
FLOAT temp0, temp1, temp2, temp3;
|
|
FLOAT temp4, temp5, temp6, temp7;
|
|
FLOAT sum0 = 0.0;
|
|
FLOAT sum1 = 0.0;
|
|
FLOAT sum2 = 0.0;
|
|
FLOAT sum3 = 0.0;
|
|
FLOAT sum4 = 0.0;
|
|
|
|
while (i < n_8) {
|
|
temp0 = ABS_K(x1[0]);
|
|
temp1 = ABS_K(x1[1]);
|
|
temp2 = ABS_K(x1[2]);
|
|
temp3 = ABS_K(x1[3]);
|
|
temp4 = ABS_K(x1[4]);
|
|
temp5 = ABS_K(x1[5]);
|
|
temp6 = ABS_K(x1[6]);
|
|
temp7 = ABS_K(x1[7]);
|
|
|
|
sum0 += temp0;
|
|
sum1 += temp1;
|
|
sum2 += temp2;
|
|
sum3 += temp3;
|
|
|
|
sum0 += temp4;
|
|
sum1 += temp5;
|
|
sum2 += temp6;
|
|
sum3 += temp7;
|
|
|
|
x1+=8;
|
|
i+=4;
|
|
}
|
|
|
|
while (i < n) {
|
|
sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
|
|
x1 += 2;
|
|
i++;
|
|
}
|
|
|
|
return sum0+sum1+sum2+sum3+sum4;
|
|
}
|
|
|
|
#endif
|
|
|
|
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|
{
|
|
BLASLONG i = 0;
|
|
BLASLONG ip = 0;
|
|
BLASLONG inc_x2;
|
|
FLOAT sumf = 0.0;
|
|
|
|
if (n <= 0 || inc_x <= 0) return(sumf);
|
|
if (inc_x == 1) {
|
|
sumf = casum_kernel(n, x);
|
|
}
|
|
else {
|
|
inc_x2 = 2 * inc_x;
|
|
|
|
while (i < n) {
|
|
sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
|
|
ip += inc_x2;
|
|
i++;
|
|
}
|
|
}
|
|
|
|
return(sumf);
|
|
}
|
|
|
|
#if defined(SMP)
|
|
static int asum_thread_function(BLASLONG n,
|
|
BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
|
|
FLOAT *x, BLASLONG inc_x,
|
|
FLOAT * dummy3, BLASLONG dummy4,
|
|
FLOAT * result, BLASLONG dummy5)
|
|
{
|
|
*(FLOAT *) result = asum_compute(n, x, inc_x);
|
|
return 0;
|
|
}
|
|
|
|
extern int blas_level1_thread_with_return_value(int mode,
|
|
BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
|
|
void *a, BLASLONG lda,
|
|
void *b, BLASLONG ldb,
|
|
void *c, BLASLONG ldc,
|
|
int (*function)(),
|
|
int nthread);
|
|
#endif
|
|
|
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|
{
|
|
#if defined(SMP)
|
|
int nthreads;
|
|
FLOAT dummy_alpha[2];
|
|
#endif
|
|
FLOAT sumf = 0.0;
|
|
|
|
#if defined(SMP)
|
|
int num_cpu = num_cpu_avail(1);
|
|
if (n <= 10000 || inc_x <= 0)
|
|
nthreads = 1;
|
|
else
|
|
nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
|
|
|
|
if (nthreads == 1) {
|
|
sumf = asum_compute(n, x, inc_x);
|
|
}
|
|
else {
|
|
int mode, i;
|
|
char result[MAX_CPU_NUMBER * sizeof(double) *2];
|
|
FLOAT *ptr;
|
|
#if !defined(DOUBLE)
|
|
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
|
#else
|
|
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
|
#endif
|
|
blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
|
|
NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
|
|
ptr = (FLOAT *)result;
|
|
for (i = 0; i < nthreads; i++) {
|
|
sumf += (*ptr);
|
|
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
|
|
}
|
|
}
|
|
#else
|
|
sumf = asum_compute(n, x, inc_x);
|
|
#endif
|
|
return(sumf);
|
|
}
|