Fix casum fallback kernel.

This kernel is only used on Skylake+ if the kernel with AVX512
intrinsics can't be used, but used the variable x1 incorrectly
in the tail end of the loop, as it is still at the initial
value instead of where x points to.

This caused 55 "other error"s in the LAPACK tests
(https://github.com/OpenMathLib/OpenBLAS/issues/4282)

This change makes casum.c as similar as possible as zasum.c,
because zasum.c does this correctly.
This commit is contained in:
Bart Oldeman 2023-11-17 23:49:34 +00:00
parent cb2950709f
commit f8ad5344c2
1 changed files with 12 additions and 12 deletions

View File

@ -9,12 +9,12 @@
#endif
#ifndef HAVE_CASUM_KERNEL
static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
{
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x = x1;
FLOAT *x1 = x;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
FLOAT sum4 = 0.0;
while (i < n_8) {
temp0 = ABS_K(x[0]);
temp1 = ABS_K(x[1]);
temp2 = ABS_K(x[2]);
temp3 = ABS_K(x[3]);
temp4 = ABS_K(x[4]);
temp5 = ABS_K(x[5]);
temp6 = ABS_K(x[6]);
temp7 = ABS_K(x[7]);
temp0 = ABS_K(x1[0]);
temp1 = ABS_K(x1[1]);
temp2 = ABS_K(x1[2]);
temp3 = ABS_K(x1[3]);
temp4 = ABS_K(x1[4]);
temp5 = ABS_K(x1[5]);
temp6 = ABS_K(x1[6]);
temp7 = ABS_K(x1[7]);
sum0 += temp0;
sum1 += temp1;
@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
sum2 += temp6;
sum3 += temp7;
x+=8;
x1+=8;
i+=4;
}
while (i < n) {
sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
x1 += 2;
i++;
}