From 276e3ebf9e1405196d36c2570bce8376b8bad2fd Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 26 Jan 2024 10:03:50 +0800 Subject: [PATCH 01/11] LoongArch64: Add dzamax and dzamin opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/camax_lasx.S | 150 +++++++++------- kernel/loongarch64/camax_lsx.S | 195 +++++++++++--------- kernel/loongarch64/camin_lasx.S | 164 +++++++++-------- kernel/loongarch64/camin_lsx.S | 215 +++++++++++++---------- 6 files changed, 421 insertions(+), 307 deletions(-) diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c365e9a75..e27ce3bee 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 68360faaf..f4429cfba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S index 7013430cb..f9a4e9012 100644 --- a/kernel/loongarch64/camax_lasx.S +++ b/kernel/loongarch64/camax_lasx.S @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmax.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmax.s VM1, x1, x2 - xvfmax.s VM0, x3, x4 - xvfmax.s VM0, VM0, VM1 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S index 2e55629de..cf46cb016 100644 --- a/kernel/loongarch64/camax_lsx.S +++ b/kernel/loongarch64/camax_lsx.S @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmax.s VM1, x1, VM1 - vfmax.s VM0, VM0, VM1 + addi.d I, I, -1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmax.s VM1, x1, x2 - vfmax.s VM0, x3, x4 - vfmax.s VM0, VM0, VM1 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S index d7931d30a..c1c4c98c8 100644 --- a/kernel/loongarch64/camin_lasx.S +++ b/kernel/loongarch64/camin_lasx.S @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else xvreplve0.w VM0, VM0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmin.s VM1, x1, x2 - xvfmin.s VM0, x3, x4 - xvfmin.s VM0, VM0, VM1 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S index e9ad6b04d..ff666ea8f 100644 --- a/kernel/loongarch64/camin_lsx.S +++ b/kernel/loongarch64/camin_lsx.S @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else vreplvei.w VM0, VM0, 0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif addi.d I, I, -1 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmin.s VM1, x1, VM1 - vfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmin.s VM1, x1, x2 - vfmin.s VM0, x3, x4 - vfmin.s VM0, VM0, VM1 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 From 519ea6e87aa357787896986836f853192c829930 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 10:39:22 +0800 Subject: [PATCH 02/11] utest: Add utest for the {sc/dz}amax and {s/d/sc/dz}amin --- utest/CMakeLists.txt | 1 + utest/Makefile | 3 +- utest/test_amax.c | 35 +++++++++++++++-- utest/test_amin.c | 89 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 utest/test_amin.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index c47954ce4..41829bd22 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -16,6 +16,7 @@ else () test_dnrm2.c test_swap.c test_zscal.c + test_amin.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index d0715c754..8acaa3ea9 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(amin, samin){
+  blasint N = 3, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9 };
+
+  te_min = BLASFUNC(samin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_DOUBLE
+CTEST(amin, damin){
+  blasint N = 3, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                 -9.9 };
+
+  te_min = BLASFUNC(damin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX
+CTEST(amin, scamin){
+  blasint N = 9, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+
+  te_min = BLASFUNC(scamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amin, dzamin){
+  blasint N = 9, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                 -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                 -7.7, 8.8 };
+
+  te_min = BLASFUNC(dzamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif

From a79d11740580db13f101b76c81a02f93654de9a7 Mon Sep 17 00:00:00 2001
From: gxw
Date: Tue, 30 Jan 2024 11:03:56 +0800
Subject: [PATCH 03/11] LoongArch64: Fixed bug for {s/d}amin

---
 kernel/loongarch64/amin_lasx.S | 1 -
 kernel/loongarch64/amin_lsx.S  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
index 0a4359002..c91a33006 100644
--- a/kernel/loongarch64/amin_lasx.S
+++ b/kernel/loongarch64/amin_lasx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     xvldrepl.w  VM0, X, 0
 #endif
-    XVFSUB      VM0, VM0, VM0
     bne         INCX, TEMP, .L20

     srai.d      I, N, 4
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
index 644caf43c..c3c3f4ae9 100644
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else vldrepl.w VM0, X, 0 #endif - VFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 3 From 3d4dfd008556b5a722162def487e0553f807e6e8 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:25:59 +0800 Subject: [PATCH 04/11] Benchmark: Rename the executable file names for {sc/dz}a{min/max} No interface named {c/z}a{min/max}, keeping it would cause ambiguity --- benchmark/Makefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..6a7c54636 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ismax.goto idmax.goto \ isamin.goto idamin.goto icamin.goto izamin.goto \ ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ + samax.goto damax.goto scamax.goto dzamax.goto \ smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ + samin.goto damin.goto scamin.goto dzamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) @@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) damax.goto : damax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMAX ############################################## +scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMAX ############################################## +dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMAX ############################################## @@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) damin.goto : damin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMIN ############################################## +scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMIN ############################################## +dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMIN ############################################## @@ -3383,10 +3383,10 @@ samax.$(SUFFIX) : amax.c damax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camax.$(SUFFIX) : amax.c +scamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamax.$(SUFFIX) : amax.c +dzamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c 
damin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camin.$(SUFFIX) : amin.c +scamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamin.$(SUFFIX) : amin.c +dzamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ From 83ce97a4ca44c1aedc9f825bcb11f3a999f09c60 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 16:54:14 +0800 Subject: [PATCH 05/11] LoongArch64: Handle NAN and INF --- kernel/loongarch64/cscal_lasx.S | 149 +------------------------------- kernel/loongarch64/cscal_lsx.S | 130 +--------------------------- 2 files changed, 4 insertions(+), 275 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index 3605a6c0e..f53526663 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 xvld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index f442a754f..241d3d16e 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 vld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE From bb043a021f138a3915c835776fdfe90673644db4 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 17:27:59 +0800 Subject: [PATCH 06/11] utest: Add tests for 
zscal --- utest/test_zscal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 8992eee90..ffc851e8b 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) ASSERT_TRUE(isnan(nan[17])); } +CTEST(zscal, i_nan_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 2); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + CTEST(zscal, nan_i) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) ASSERT_TRUE(isnan(i[16])); ASSERT_TRUE(isnan(i[17])); } - + +CTEST(zscal, nan_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + CTEST(zscal, i_inf) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); ASSERT_TRUE(isinf(inf[17])); -} +} + +CTEST(zscal, i_inf_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 2); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} CTEST(zscal, inf_i) { @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) ASSERT_TRUE(isinf(i[17])); } +CTEST(zscal, inf_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + #endif From 969601a1dcfdc4c44174346b7c752fa338f00737 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 31 Jan 2024 11:20:25 +0800 Subject: [PATCH 07/11] X86_64: Fixed bug in zscal Fixed handling of NAN and INF arguments when inc is greater than 1. 
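For reference, the IEEE-754 behaviour the strided path has to preserve can be
sketched with plain scalar arithmetic. This is an illustrative snippet only
(the variable names are made up for the example); it mirrors the new *_inc_2
cases added to utest/test_zscal.c without calling into the library:

    #include <math.h>
    #include <stdio.h>

    /* Scaling x = Inf + 0*I by alpha = 0 + 1*I: the real part is
     * 0*Inf - 1*0 = NaN and the imaginary part is 0*0 + 1*Inf = Inf,
     * so a kernel may not shortcut the alpha_r == 0.0 case. */
    int main(void) {
        double alpha_r = 0.0, alpha_i = 1.0;      /* alpha = 0 + 1i   */
        double xr = INFINITY, xi = 0.0;           /* x     = Inf + 0i */
        double re = alpha_r * xr - alpha_i * xi;  /* -> NaN */
        double im = alpha_r * xi + alpha_i * xr;  /* -> Inf */
        printf("re NaN: %d, im Inf: %d\n", isnan(re), isinf(im));
        return 0;
    }

Built with any C99 compiler this prints non-zero for both checks, which is
what cblas_zscal with incx = 2 is now expected to produce for such inputs.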
--- kernel/x86_64/zscal.c | 91 ++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 66c8a0d2b..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { x[i]= NAN; @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (x[i] < -FLT_MAX || x[i] > FLT_MAX) temp0 = NAN; x[i+1] = da_i * x[i]; - if ( x[i] == x[i]) //preserve NaN + if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - + temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } From 1a6fdb035308370c08c740da279b769615594980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 15:57:57 +0100 Subject: [PATCH 08/11] Add prototypes for extensions ?AMIN/?AMAX and CAXPYC/ZAXPYC --- cblas.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cblas.h b/cblas.h index ade2fca3a..3b74e25ee 100644 --- a/cblas.h +++ b/cblas.h @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST 
blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -290,7 +303,6 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, From b54cda849096ade35bd2f69341e3d02fa1543512 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 16:00:52 +0100 Subject: [PATCH 09/11] Unify creation of CBLAS interfaces for ?AMIN/?AMAX and C/ZAXPYC between gmake and cmake builds --- interface/CMakeLists.txt | 2 ++ interface/Makefile | 49 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ed19b556a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -130,6 +130,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..99859cbf5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) 
cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From a7d004e820f1ccbc9f61b4b1353ccdb04f208690 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 17:55:42 +0100 Subject: [PATCH 10/11] Fix CBLAS prototype --- interface/max.c | 7 ++++++- 1 file changed, 6 
insertions(+), 1 deletion(-) diff --git a/interface/max.c b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; From 47bd06476312598eea694f19a00a9191041b1586 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 20:49:43 +0100 Subject: [PATCH 11/11] Fix names in build rules --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 99859cbf5..ad4a0fb89 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1541,10 +1541,10 @@ cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c @@ -1553,10 +1553,10 @@ cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c