Merge pull request #4807 from martin-frbg/scalfixes

[WIP] Make NaN handling in the SCAL kernels depend on the dummy2 parameter
This commit is contained in:
Martin Kroeker 2024-07-25 23:42:50 +02:00 committed by GitHub
commit fb7c53c5e5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 151 additions and 29 deletions

View File

@ -43,6 +43,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (n <= 0) || (inc_x <= 0)) if ( (n <= 0) || (inc_x <= 0))
return(0); return(0);
if (dummy2 == 0) {
while(j < n)
{
if ( da == 0.0 )
x[i]=0.0;
else
x[i] = da * x[i] ;
i += inc_x ;
j++;
}
} else {
while(j < n) while(j < n)
{ {
@ -60,6 +73,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j++; j++;
} }
}
return 0; return 0;
} }

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define X_COPY x5 /* X vector address */ #define X_COPY x5 /* X vector address */
#define INC_X x4 /* X stride */ #define INC_X x4 /* X stride */
#define I x1 /* loop variable */ #define I x1 /* loop variable */
#define FLAG x9
/******************************************************************************* /*******************************************************************************
* Macro definitions * Macro definitions
*******************************************************************************/ *******************************************************************************/
@ -168,9 +168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, xzr cmp N, xzr
ble .Lscal_kernel_L999 ble .Lscal_kernel_L999
//fcmp DA, #0.0 ldr FLAG, [sp]
//beq .Lscal_kernel_zero cmp FLAG, #1
beq .Lscal_kernel_nansafe
fcmp DA, #0.0
beq .Lscal_kernel_zero
.Lscal_kernel_nansafe:
cmp INC_X, #1 cmp INC_X, #1
bne .Lscal_kernel_S_BEGIN bne .Lscal_kernel_S_BEGIN

View File

@ -73,6 +73,15 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
for( i=0; i<n; i+=8 ) for( i=0; i<n; i+=8 )
{ {
x[0] = alpha;
x[1] = alpha;
x[2] = alpha;
x[3] = alpha;
x[4] = alpha;
x[5] = alpha;
x[6] = alpha;
x[7] = alpha;
#if 0
if(isfinite(x[0])) if(isfinite(x[0]))
x[0] = alpha; x[0] = alpha;
else else
@ -106,6 +115,7 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
else else
x[7] = NAN; x[7] = NAN;
x+=8; x+=8;
#endif
} }
} }
@ -130,6 +140,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( n >= 16 ) if ( n >= 16 )
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
if (dummy2 == 0)
for (j = 0; j < align; j++) {
x [j] = 0.0;
}
else
for (j = 0; j < align; j++) { for (j = 0; j < align; j++) {
if (isfinite(x[j])) if (isfinite(x[j]))
x[j] = 0.0; x[j] = 0.0;
@ -151,7 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j=n1; j=n1;
} }
#endif #endif
if (dummy2 == 0)
while(j < n)
{
x[j]=0.0;
j++;
}
else
while(j < n) while(j < n)
{ {
if (!isfinite(x[j])) if (!isfinite(x[j]))
@ -202,7 +223,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 ) if ( da == 0.0 )
{ {
if (dummy2 == 0)
while(j < n)
{
x[i]=0.0;
i += inc_x;
j++;
}
else
while(j < n) while(j < n)
{ {
if (!isfinite(x[i])) if (!isfinite(x[i]))

View File

@ -47,9 +47,11 @@
#ifndef __64BIT__ #ifndef __64BIT__
#define X r6 #define X r6
#define INCX r7 #define INCX r7
#define FLAG r11
#else #else
#define X r7 #define X r7
#define INCX r8 #define INCX r8
#define FLAG r12
#endif #endif
#endif #endif
@ -57,9 +59,11 @@
#if !defined(__64BIT__) && defined(DOUBLE) #if !defined(__64BIT__) && defined(DOUBLE)
#define X r8 #define X r8
#define INCX r9 #define INCX r9
#define FLAG r13
#else #else
#define X r7 #define X r7
#define INCX r8 #define INCX r8
#define FLAG r12
#endif #endif
#endif #endif
@ -84,9 +88,12 @@
cmpwi cr0, N, 0 cmpwi cr0, N, 0
blelr- cr0 blelr- cr0
// fcmpu cr0, FZERO, ALPHA fcmpu cr0, FZERO, ALPHA
// bne- cr0, LL(A1I1) bne- cr0, LL(A1I1)
b LL(A1I1)
ld FLAG, 48+64+8(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)
cmpwi cr0, INCX, SIZE cmpwi cr0, INCX, SIZE
bne- cr0, LL(A0IN) bne- cr0, LL(A0IN)

View File

@ -74,6 +74,23 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
for( i=0; i<n; i+=8 ) for( i=0; i<n; i+=8 )
{ {
x[0] = alpha;
x[1] = alpha;
x[2] = alpha;
x[3] = alpha;
x[4] = alpha;
x[5] = alpha;
x[6] = alpha;
x[7] = alpha;
x[8] = alpha;
x[9] = alpha;
x[10] = alpha;
x[11] = alpha;
x[12] = alpha;
x[13] = alpha;
x[14] = alpha;
x[15] = alpha;
#if 0
if (isfinite(x[0])) if (isfinite(x[0]))
x[0] = alpha; x[0] = alpha;
else else
@ -107,6 +124,7 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
else else
x[7] = NAN; x[7] = NAN;
x+=8; x+=8;
#endif
} }
} }
@ -132,6 +150,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( n >= 32 ) if ( n >= 32 )
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
if (dummy2 == 0)
for (j = 0; j < align; j++){
x[j] = 0.0;
}
else
for (j = 0; j < align; j++) { for (j = 0; j < align; j++) {
if (isfinite(x[j])) if (isfinite(x[j]))
x[j] = 0.0; x[j] = 0.0;
@ -153,7 +176,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j=n1; j=n1;
} }
#endif #endif
if (dummy2 == 0)
while(j < n)
{
x[j] = 0.0;
j++;
}
else
while(j < n) while(j < n)
{ {
if (isfinite(x[j])) if (isfinite(x[j]))
@ -204,7 +233,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 ) if ( da == 0.0 )
{ {
if (dummy2 == 0)
while(j < n)
{
x[i]=0.0;
i += inc_x;
j++;
}
else
while(j < n) while(j < n)
{ {
if (isfinite(x[i])) if (isfinite(x[i]))

View File

@ -43,7 +43,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (n <= 0) || (inc_x <= 0)) if ( (n <= 0) || (inc_x <= 0))
return(0); return(0);
if (dummy2 == 0) {
while(j < n) while(j < n)
{ {
@ -57,7 +57,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
i += inc_x ; i += inc_x ;
j++; j++;
}
} else {
while(j < n)
{
if ( da == 0.0 )
x[i]=0.0;
else
x[i] = da * x[i] ;
i += inc_x ;
j++;
}
} }
return 0; return 0;

View File

@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
FLOAT_V_T v0; FLOAT_V_T v0;
if(inc_x == 1) { if(inc_x == 1) {
if(da == 0.0) { if(dummy2 == 0 && da == 0.0) {
int gvl = VSETVL_MAX; int gvl = VSETVL_MAX;
v0 = VFMVVF_FLOAT(0.0, gvl); v0 = VFMVVF_FLOAT(0.0, gvl);
for (size_t vl; n > 0; n -= vl, x += vl) { for (size_t vl; n > 0; n -= vl, x += vl) {
@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} else { } else {
BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_x = inc_x * sizeof(FLOAT);
if(da == 0.0) { if(dummy2 == 0 && da == 0.0) {
int gvl = VSETVL_MAX; int gvl = VSETVL_MAX;
v0 = VFMVVF_FLOAT(0.0, gvl); v0 = VFMVVF_FLOAT(0.0, gvl);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {

View File

@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
FLOAT_V_T v0, v1; FLOAT_V_T v0, v1;
unsigned int gvl = 0; unsigned int gvl = 0;
if(inc_x == 1){ if(inc_x == 1){
if (0){ //if(da == 0.0){ if(dummy2 == 0 && da == 0.0){
memset(&x[0], 0, n * sizeof(FLOAT)); memset(&x[0], 0, n * sizeof(FLOAT));
}else{ }else{
gvl = VSETVL(n); gvl = VSETVL(n);
@ -96,7 +96,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
} }
}else{ }else{
if (0) { //if(da == 0.0){ if(dummy2 == 0 && da == 0.0){
BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG ix = 0; BLASLONG ix = 0;
gvl = VSETVL(n); gvl = VSETVL(n);

View File

@ -57,19 +57,24 @@
#ifdef XDOUBLE #ifdef XDOUBLE
movl 44(%esp),%edi movl 44(%esp),%edi
movl 48(%esp),%esi movl 48(%esp),%esi
movl 64(%esp),%ecx
#elif defined(DOUBLE) #elif defined(DOUBLE)
movl 36(%esp),%edi movl 36(%esp),%edi
movl 40(%esp),%esi movl 40(%esp),%esi
movl 56(%esp),%ecx
#else #else
movl 32(%esp),%edi movl 32(%esp),%edi
movl 36(%esp),%esi movl 36(%esp),%esi
movl 54(%esp),%ecx
#endif #endif
ftst ftst
fnstsw %ax fnstsw %ax
andb $68, %ah andb $68, %ah
// je .L300 # Alpha != ZERO je .L300 # Alpha != ZERO
jmp .L300
cmpl $1,%ecx # dummy2 flag
je .L300
/* Alpha == ZERO */ /* Alpha == ZERO */
cmpl $1,%esi cmpl $1,%esi

View File

@ -60,8 +60,10 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
movq 64(%rsp), %r9
movaps %xmm3, %xmm0 movaps %xmm3, %xmm0
#else
movq 24(%rsp), %r9
#endif #endif
SAVEREGISTERS SAVEREGISTERS
@ -73,6 +75,10 @@
lea (, INCX, SIZE), INCX lea (, INCX, SIZE), INCX
comisd %xmm0, %xmm1 comisd %xmm0, %xmm1
jne .L100 jne .L100
jp .L100
cmpq $1, %r9
je .L100
/* Alpha == ZERO */ /* Alpha == ZERO */
cmpq $SIZE, INCX cmpq $SIZE, INCX

View File

@ -60,8 +60,10 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
movq 64(%rsp), %r9
movaps %xmm3, %xmm0 movaps %xmm3, %xmm0
#else
movq 24(%rsp), %r9
#endif #endif
SAVEREGISTERS SAVEREGISTERS
@ -76,6 +78,8 @@
shufps $0, %xmm0, %xmm0 shufps $0, %xmm0, %xmm0
jne .L100 # Alpha != ZERO jne .L100 # Alpha != ZERO
cmpq $1, %r9
je .L100 je .L100
/* Alpha == ZERO */ /* Alpha == ZERO */
cmpq $SIZE, INCX cmpq $SIZE, INCX

View File

@ -48,6 +48,7 @@
#define X ARG2 #define X ARG2
#define INCX ARG3 #define INCX ARG3
#endif #endif
#define FLAG %r9
#define XX %r10 #define XX %r10
#define I %rax #define I %rax
@ -60,8 +61,10 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
movq 64(%rsp), FLAG
movaps %xmm3, %xmm0 movaps %xmm3, %xmm0
#else
movq 24(%rsp), FLAG
#endif #endif
SAVEREGISTERS SAVEREGISTERS
@ -75,6 +78,8 @@
comisd %xmm0, %xmm1 comisd %xmm0, %xmm1
jne .L100 # Alpha != ZERO jne .L100 # Alpha != ZERO
jp .L100 # For Alpha = NaN jp .L100 # For Alpha = NaN
cmpq $1, FLAG
je .L100 # disable the Alpha=zero path as it does not handle x=inf or nan je .L100 # disable the Alpha=zero path as it does not handle x=inf or nan
/* Alpha == ZERO */ /* Alpha == ZERO */
cmpq $SIZE, INCX cmpq $SIZE, INCX

View File

@ -74,7 +74,7 @@
pxor %xmm15, %xmm15 pxor %xmm15, %xmm15
comisd %xmm0, %xmm15 comisd %xmm0, %xmm15
jne .L30 # Alpha_r != ZERO jne .L30 # Alpha_r != ZERO
jp .L30
comisd %xmm1, %xmm15 comisd %xmm1, %xmm15
jne .L30 # Alpha_i != ZERO jne .L30 # Alpha_i != ZERO

View File

@ -76,7 +76,7 @@
pxor %xmm15, %xmm15 pxor %xmm15, %xmm15
comiss %xmm0, %xmm15 comiss %xmm0, %xmm15
jne .L100 # Alpha_r != ZERO jne .L100 # Alpha_r != ZERO
jp .L100 # Alpha_r == NAN
comiss %xmm1, %xmm15 comiss %xmm1, %xmm15
jne .L100 # Alpha_i != ZERO jne .L100 # Alpha_i != ZERO