Merge pull request #4807 from martin-frbg/scalfixes

[WIP] Make NaN handling in the SCAL kernels depend on the dummy2 parameter
2024-07-25 23:42:50 +02:00 · 2024-07-25 23:42:50 +02:00 · fb7c53c5e5
parent 15c53dd2e0 b613754143
commit fb7c53c5e5
14 changed files with 151 additions and 29 deletions
--- a/kernel/arm/scal.c
+++ b/kernel/arm/scal.c
@ -43,9 +43,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	if ( (n <= 0) || (inc_x <= 0))
 		return(0);
 	
+	if (dummy2 == 0) {
+		while(j < n)
+		{

-	while(j < n)
-	{
+		if ( da == 0.0 )
+			x[i]=0.0;
+		else
+			x[i] = da * x[i] ;
+
+		i += inc_x ;
+		j++;
+		}
+	} else {
+	
+		while(j < n)
+		{

 		if ( da == 0.0 )
 			if (!isnan(x[i]) && !isinf(x[i])) {
@ -59,6 +72,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		i += inc_x ;
 		j++;

+		}
 	}
 	return 0;

--- a/kernel/arm64/scal.S
+++ b/kernel/arm64/scal.S
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define	X_COPY	x5	/* X vector address */
 #define	INC_X	x4	/* X stride */
 #define I	x1	/* loop variable */
-
+#define FLAG    x9
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@ -168,9 +168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	N, xzr
 	ble	.Lscal_kernel_L999

-	//fcmp	DA, #0.0
-	//beq	.Lscal_kernel_zero
+	ldr	FLAG, [sp]
+	cmp	FLAG, #1
+	beq	.Lscal_kernel_nansafe

+	fcmp	DA, #0.0
+	beq	.Lscal_kernel_zero
+
+.Lscal_kernel_nansafe:
 	cmp	INC_X, #1
 	bne	.Lscal_kernel_S_BEGIN

--- a/kernel/power/dscal.c
+++ b/kernel/power/dscal.c
@ -73,6 +73,15 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)

        for( i=0; i<n; i+=8 )
        {
+		x[0] = alpha;
+		x[1] = alpha;
+		x[2] = alpha;
+		x[3] = alpha;
+		x[4] = alpha;
+		x[5] = alpha;
+		x[6] = alpha;
+		x[7] = alpha;
+#if 0
                if(isfinite(x[0]))
 			x[0] = alpha;
 		else	
@ -106,7 +115,8 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
 		else
 			x[7] = NAN;
                x+=8;
-        }
+#endif 
+	}

 }

@ -130,6 +140,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			if ( n >= 16 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
+				if (dummy2 == 0)
+				for (j = 0; j < align; j++) {
+					x [j] = 0.0;
+				}
+				else
 				for (j = 0; j < align; j++) {
 					if (isfinite(x[j]))
 						x[j] = 0.0;
@ -151,7 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 				j=n1;
 			}
 #endif
-
+			if (dummy2 == 0)
+			while(j < n)
+			{
+				x[j]=0.0;
+				j++;
+			}
+			else
 			while(j < n)
 			{
 				if (!isfinite(x[j]))
@ -202,7 +223,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS

 		if ( da == 0.0 )
 		{		
-
+		if (dummy2 == 0)
+			while(j < n)
+			{
+				x[i]=0.0;
+				i += inc_x;
+				j++;
+			}
+		else
 			while(j < n)
 			{
 				if (!isfinite(x[i]))
--- a/kernel/power/scal.S
+++ b/kernel/power/scal.S
@ -47,9 +47,11 @@
 #ifndef __64BIT__
 #define X r6
 #define INCX r7
+#define FLAG r11
 #else
 #define X r7
 #define INCX r8
+#define FLAG r12
 #endif
 #endif

@ -57,9 +59,11 @@
 #if !defined(__64BIT__) && defined(DOUBLE)
 #define X r8
 #define INCX r9
+#define FLAG r13
 #else
 #define X r7
 #define INCX r8
+#define FLAG r12
 #endif
 #endif

@ -84,9 +88,12 @@
 	cmpwi	cr0, N, 0
 	blelr-	cr0

-//	fcmpu	cr0, FZERO, ALPHA
-//	bne-	cr0, LL(A1I1)
-	b LL(A1I1)
+	fcmpu	cr0, FZERO, ALPHA
+	bne-	cr0, LL(A1I1)
+
+	ld      FLAG,    48+64+8(SP)
+	cmpwi   cr0, FLAG, 1
+	beq-   cr0, LL(A1I1)

 	cmpwi	cr0, INCX, SIZE
 	bne-	cr0, LL(A0IN)
--- a/kernel/power/sscal.c
+++ b/kernel/power/sscal.c
@ -74,7 +74,24 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )

        for( i=0; i<n; i+=8 )
        {
-                if (isfinite(x[0]))
+		x[0] = alpha;
+		x[1] = alpha;
+		x[2] = alpha;
+		x[3] = alpha;
+		x[4] = alpha;
+		x[5] = alpha;
+		x[6] = alpha;
+		x[7] = alpha;
+		x[8] = alpha;
+		x[9] = alpha;
+		x[10] = alpha;
+		x[11] = alpha;
+		x[12] = alpha;
+		x[13] = alpha;
+		x[14] = alpha;
+		x[15] = alpha;
+#if 0
+		if (isfinite(x[0]))
 			x[0] = alpha;
 		else
 			x[0] = NAN;
@ -107,7 +124,8 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
 		else
 			x[7] = NAN;
                x+=8;
-        }
+#endif
+    	}

 }

@ -132,6 +150,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			if ( n >= 32 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
+				if (dummy2 == 0)
+					for (j = 0; j < align; j++){
+						x[j] = 0.0;
+					}
+				else
 				for (j = 0; j < align; j++) {
 					if (isfinite(x[j]))
 						x[j] = 0.0;
@ -153,9 +176,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 				j=n1;
 			}
 #endif
-
+			if (dummy2 == 0)
 			while(j < n)
 			{
+				x[j] = 0.0;
+				j++;
+			}
+			else
+			while(j < n)
+			{	
 				if (isfinite(x[j]))
 					x[j]=0.0;
 				else
@ -204,7 +233,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS

 		if ( da == 0.0 )
 		{		
-
+		if (dummy2 == 0)
+			while(j < n)
+			{
+				x[i]=0.0;
+				i += inc_x;
+				j++;
+			}
+		else
 			while(j < n)
 			{
 				if (isfinite(x[i]))
--- a/kernel/riscv64/scal.c
+++ b/kernel/riscv64/scal.c
@ -43,9 +43,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	if ( (n <= 0) || (inc_x <= 0))
 		return(0);
 	
-
-	while(j < n)
-	{
+	if (dummy2 == 0) {
+		while(j < n)
+		{

 		if ( da == 0.0 )
 			if (isfinite(x[i]))
@ -57,7 +57,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS

 		i += inc_x ;
 		j++;
+		}
+	} else {
+		while(j < n)
+		{

+		if ( da == 0.0 )
+			x[i]=0.0;
+		else
+			x[i] = da * x[i] ;
+
+		i += inc_x ;
+		j++;
+		}
 	}
 	return 0;

--- a/kernel/riscv64/scal_rvv.c
+++ b/kernel/riscv64/scal_rvv.c
@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
    FLOAT_V_T v0;
 
    if(inc_x == 1) {
-        if(da == 0.0) {
+        if(dummy2 == 0 && da == 0.0) {
            int gvl = VSETVL_MAX;
            v0 = VFMVVF_FLOAT(0.0, gvl);
            for (size_t vl; n > 0; n -= vl, x += vl) {
@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
    } else {
        BLASLONG stride_x = inc_x * sizeof(FLOAT);

-        if(da == 0.0) {
+        if(dummy2 == 0 && da == 0.0) {
            int gvl = VSETVL_MAX;
            v0 = VFMVVF_FLOAT(0.0, gvl);
            for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
        FLOAT_V_T v0, v1;
        unsigned int gvl = 0;
        if(inc_x == 1){
-                if (0){ //if(da == 0.0){
+                if(dummy2 == 0 && da == 0.0){
                        memset(&x[0], 0, n * sizeof(FLOAT));
                }else{
                        gvl = VSETVL(n);
@ -96,7 +96,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
                        }
                }
        }else{
-                if (0) { //if(da == 0.0){
+                if(dummy2 == 0 && da == 0.0){
                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
                        BLASLONG ix = 0;
                        gvl = VSETVL(n);
--- a/kernel/x86/scal.S
+++ b/kernel/x86/scal.S
@ -57,19 +57,24 @@
 #ifdef XDOUBLE
 	movl	44(%esp),%edi
 	movl	48(%esp),%esi
+	movl	64(%esp),%ecx		# dummy2 flag, located at X + 20
 #elif defined(DOUBLE)
 	movl	36(%esp),%edi
 	movl	40(%esp),%esi
+	movl	56(%esp),%ecx		# dummy2 flag, located at X + 20
 #else
 	movl	32(%esp),%edi
 	movl	36(%esp),%esi
+	movl	52(%esp),%ecx		# dummy2 flag, located at X + 20 (was 54: misaligned, straddled two slots)
 #endif

 	ftst
 	fnstsw	%ax
 	andb	$68, %ah
-//	je	.L300		# Alpha != ZERO
-	jmp	.L300
+	je	.L300		# Alpha != ZERO
+
+	cmpl	$1,%ecx		# dummy2 flag
+	je	.L300

 /* Alpha == ZERO */
 	cmpl	$1,%esi
--- a/kernel/x86_64/scal_atom.S
+++ b/kernel/x86_64/scal_atom.S
@ -60,8 +60,10 @@
 #ifdef WINDOWS_ABI
 	movq	40(%rsp), X
 	movq	48(%rsp), INCX
-
+	movq    64(%rsp), %r9
 	movaps	%xmm3, %xmm0
+#else
+	movq	24(%rsp), %r9
 #endif

 	SAVEREGISTERS
@ -73,6 +75,10 @@
 	lea	(, INCX, SIZE), INCX
 	comisd	%xmm0, %xmm1
 	jne	.L100
+	jp	.L100
+
+	cmpq	$1, %r9
+	je	.L100

 /* Alpha == ZERO */
 	cmpq	$SIZE, INCX
--- a/kernel/x86_64/scal_sse.S
+++ b/kernel/x86_64/scal_sse.S
@ -60,8 +60,10 @@
 #ifdef WINDOWS_ABI
 	movq	40(%rsp), X
 	movq	48(%rsp), INCX
-
+	movq	64(%rsp), %r9
 	movaps	%xmm3, %xmm0
+#else
+	movq    24(%rsp), %r9
 #endif

 	SAVEREGISTERS
@ -76,6 +78,8 @@
 	shufps	$0, %xmm0, %xmm0

 	jne	.L100		# Alpha != ZERO
+
+	cmpq   $1, %r9
 	je	.L100
 /* Alpha == ZERO */
 	cmpq	$SIZE, INCX
--- a/kernel/x86_64/scal_sse2.S
+++ b/kernel/x86_64/scal_sse2.S
@ -48,6 +48,7 @@
 #define X	ARG2
 #define INCX	ARG3
 #endif
+#define FLAG    %r9

 #define XX	%r10
 #define I	%rax
@ -60,8 +61,10 @@
 #ifdef WINDOWS_ABI
 	movq	40(%rsp), X
 	movq	48(%rsp), INCX
-
+	movq	64(%rsp), FLAG
 	movaps	%xmm3, %xmm0
+#else
+	movq    24(%rsp), FLAG
 #endif

 	SAVEREGISTERS
@ -75,6 +78,8 @@
 	comisd	%xmm0, %xmm1
 	jne	.L100		# Alpha != ZERO
 	jp	.L100		# For Alpha = NaN
+
+	cmpq $1, FLAG
 	je 	.L100		# disable the Alpha=zero path as it does not handle x=inf or nan
 /* Alpha == ZERO */
 	cmpq	$SIZE, INCX
--- a/kernel/x86_64/zscal_atom.S
+++ b/kernel/x86_64/zscal_atom.S
@ -74,7 +74,7 @@
 	pxor	%xmm15, %xmm15
 	comisd	%xmm0, %xmm15
 	jne	.L30		# Alpha_r != ZERO
-
+	jp	.L30
 	comisd	%xmm1, %xmm15
 	jne	.L30		# Alpha_i != ZERO

--- a/kernel/x86_64/zscal_sse.S
+++ b/kernel/x86_64/zscal_sse.S
@ -76,7 +76,7 @@
 	pxor	%xmm15, %xmm15
 	comiss	%xmm0, %xmm15
 	jne	.L100		# Alpha_r != ZERO
-
+	jp	.L100		# Alpha_r == NAN	
 	comiss	%xmm1, %xmm15
 	jne	.L100		# Alpha_i != ZERO