From 276e3ebf9e1405196d36c2570bce8376b8bad2fd Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 26 Jan 2024 10:03:50 +0800 Subject: [PATCH 01/11] LoongArch64: Add dzamax and dzamin opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/camax_lasx.S | 150 +++++++++------- kernel/loongarch64/camax_lsx.S | 195 +++++++++++--------- kernel/loongarch64/camin_lasx.S | 164 +++++++++-------- kernel/loongarch64/camin_lsx.S | 215 +++++++++++++---------- 6 files changed, 421 insertions(+), 307 deletions(-) diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c365e9a75..e27ce3bee 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 68360faaf..f4429cfba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S index 7013430cb..f9a4e9012 100644 --- a/kernel/loongarch64/camax_lasx.S +++ b/kernel/loongarch64/camax_lasx.S @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmax.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmax.s VM1, x1, x2 - xvfmax.s VM0, x3, x4 - xvfmax.s VM0, VM0, VM1 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S index 2e55629de..cf46cb016 100644 --- a/kernel/loongarch64/camax_lsx.S +++ b/kernel/loongarch64/camax_lsx.S @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmax.s VM1, x1, VM1 - vfmax.s VM0, VM0, VM1 + addi.d I, I, -1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmax.s VM1, x1, x2 - vfmax.s VM0, x3, x4 - vfmax.s VM0, VM0, VM1 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S index d7931d30a..c1c4c98c8 100644 --- a/kernel/loongarch64/camin_lasx.S +++ b/kernel/loongarch64/camin_lasx.S @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else xvreplve0.w VM0, VM0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmin.s VM1, x1, x2 - xvfmin.s VM0, x3, x4 - xvfmin.s VM0, VM0, VM1 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S index e9ad6b04d..ff666ea8f 100644 --- a/kernel/loongarch64/camin_lsx.S +++ b/kernel/loongarch64/camin_lsx.S @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else vreplvei.w VM0, VM0, 0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif addi.d I, I, -1 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmin.s VM1, x1, VM1 - vfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmin.s VM1, x1, x2 - vfmin.s VM0, x3, x4 - vfmin.s VM0, VM0, VM1 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 From 519ea6e87aa357787896986836f853192c829930 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 10:39:22 +0800 Subject: [PATCH 02/11] utest: Add utest for the {sc/dz}amax and {s/d/sc/dz}amin --- utest/CMakeLists.txt | 1 + utest/Makefile | 3 +- utest/test_amax.c | 35 +++++++++++++++-- utest/test_amin.c | 89 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 utest/test_amin.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index c47954ce4..41829bd22 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -16,6 +16,7 @@ else () test_dnrm2.c test_swap.c test_zscal.c + test_amin.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index d0715c754..8acaa3ea9 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(amin, samin){
+  blasint N = 3, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9 };
+
+  te_min = BLASFUNC(samin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_DOUBLE
+CTEST(amin, damin){
+  blasint N = 3, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                 -9.9 };
+
+  te_min = BLASFUNC(damin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX
+CTEST(amin, scamin){
+  blasint N = 9, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+
+  te_min = BLASFUNC(scamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amin, dzamin){
+  blasint N = 9, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                 -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                 -7.7, 8.8 };
+
+  te_min = BLASFUNC(dzamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif

From a79d11740580db13f101b76c81a02f93654de9a7 Mon Sep 17 00:00:00 2001
From: gxw
Date: Tue, 30 Jan 2024 11:03:56 +0800
Subject: [PATCH 03/11] LoongArch64: Fixed bug for {s/d}amin

---
 kernel/loongarch64/amin_lasx.S | 1 -
 kernel/loongarch64/amin_lsx.S  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
index 0a4359002..c91a33006 100644
--- a/kernel/loongarch64/amin_lasx.S
+++ b/kernel/loongarch64/amin_lasx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     xvldrepl.w  VM0, X, 0
 #endif
-    XVFSUB      VM0, VM0, VM0
     bne         INCX, TEMP, .L20

     srai.d      I, N, 4
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
index 644caf43c..c3c3f4ae9 100644
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else vldrepl.w VM0, X, 0 #endif - VFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 3 From 3d4dfd008556b5a722162def487e0553f807e6e8 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:25:59 +0800 Subject: [PATCH 04/11] Benchmark: Rename the executable file names for {sc/dz}a{min/max} No interface named {c/z}a{min/max}, keeping it would cause ambiguity --- benchmark/Makefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..6a7c54636 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ismax.goto idmax.goto \ isamin.goto idamin.goto icamin.goto izamin.goto \ ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ + samax.goto damax.goto scamax.goto dzamax.goto \ smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ + samin.goto damin.goto scamin.goto dzamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) @@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) damax.goto : damax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMAX ############################################## +scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMAX ############################################## +dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMAX ############################################## @@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) damin.goto : damin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMIN ############################################## +scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMIN ############################################## +dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMIN ############################################## @@ -3383,10 +3383,10 @@ samax.$(SUFFIX) : amax.c damax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camax.$(SUFFIX) : amax.c +scamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamax.$(SUFFIX) : amax.c +dzamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c 
damin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camin.$(SUFFIX) : amin.c +scamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamin.$(SUFFIX) : amin.c +dzamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ From 83ce97a4ca44c1aedc9f825bcb11f3a999f09c60 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 16:54:14 +0800 Subject: [PATCH 05/11] LoongArch64: Handle NAN and INF --- kernel/loongarch64/cscal_lasx.S | 149 +------------------------------- kernel/loongarch64/cscal_lsx.S | 130 +--------------------------- 2 files changed, 4 insertions(+), 275 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index 3605a6c0e..f53526663 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 xvld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index f442a754f..241d3d16e 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 vld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE From bb043a021f138a3915c835776fdfe90673644db4 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 17:27:59 +0800 Subject: [PATCH 06/11] utest: Add tests for 
zscal --- utest/test_zscal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 8992eee90..ffc851e8b 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) ASSERT_TRUE(isnan(nan[17])); } +CTEST(zscal, i_nan_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 2); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + CTEST(zscal, nan_i) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) ASSERT_TRUE(isnan(i[16])); ASSERT_TRUE(isnan(i[17])); } - + +CTEST(zscal, nan_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + CTEST(zscal, i_inf) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); ASSERT_TRUE(isinf(inf[17])); -} +} + +CTEST(zscal, i_inf_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 2); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} CTEST(zscal, inf_i) { @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) ASSERT_TRUE(isinf(i[17])); } +CTEST(zscal, inf_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + #endif From 969601a1dcfdc4c44174346b7c752fa338f00737 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 31 Jan 2024 11:20:25 +0800 Subject: [PATCH 07/11] X86_64: Fixed bug in zscal Fixed handling of NAN and INF arguments when inc is greater than 1. 
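For reference, the IEEE-754 behaviour the strided path has to preserve can be
sketched with plain scalar arithmetic. This is an illustrative snippet only
(the variable names are made up for the example); it mirrors the new *_inc_2
cases added to utest/test_zscal.c without calling into the library:

    #include <math.h>
    #include <stdio.h>

    /* Scaling x = Inf + 0*I by alpha = 0 + 1*I: the real part is
     * 0*Inf - 1*0 = NaN and the imaginary part is 0*0 + 1*Inf = Inf,
     * so a kernel may not shortcut the alpha_r == 0.0 case. */
    int main(void) {
        double alpha_r = 0.0, alpha_i = 1.0;      /* alpha = 0 + 1i   */
        double xr = INFINITY, xi = 0.0;           /* x     = Inf + 0i */
        double re = alpha_r * xr - alpha_i * xi;  /* -> NaN */
        double im = alpha_r * xi + alpha_i * xr;  /* -> Inf */
        printf("re NaN: %d, im Inf: %d\n", isnan(re), isinf(im));
        return 0;
    }

Built with any C99 compiler this prints non-zero for both checks, which is
what cblas_zscal with incx = 2 is now expected to produce for such inputs.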
--- kernel/x86_64/zscal.c | 91 ++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 66c8a0d2b..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { x[i]= NAN; @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (x[i] < -FLT_MAX || x[i] > FLT_MAX) temp0 = NAN; x[i+1] = da_i * x[i]; - if ( x[i] == x[i]) //preserve NaN + if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - + temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } From 1a6fdb035308370c08c740da279b769615594980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 15:57:57 +0100 Subject: [PATCH 08/11] Add prototypes for extensions ?AMIN/?AMAX and CAXPYC/ZAXPYC --- cblas.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cblas.h b/cblas.h index ade2fca3a..3b74e25ee 100644 --- a/cblas.h +++ b/cblas.h @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST 
blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -290,7 +303,6 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, From b54cda849096ade35bd2f69341e3d02fa1543512 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 16:00:52 +0100 Subject: [PATCH 09/11] Unify creation of CBLAS interfaces for ?AMIN/?AMAX and C/ZAXPYC between gmake and cmake builds --- interface/CMakeLists.txt | 2 ++ interface/Makefile | 49 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ed19b556a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -130,6 +130,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..99859cbf5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) 
cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From a7d004e820f1ccbc9f61b4b1353ccdb04f208690 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 17:55:42 +0100 Subject: [PATCH 10/11] Fix CBLAS prototype --- interface/max.c | 7 ++++++- 1 file changed, 6 
insertions(+), 1 deletion(-) diff --git a/interface/max.c b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; From 47bd06476312598eea694f19a00a9191041b1586 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 20:49:43 +0100 Subject: [PATCH 11/11] Fix names in build rules --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 99859cbf5..ad4a0fb89 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1541,10 +1541,10 @@ cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c @@ -1553,10 +1553,10 @@ cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c