diff --git a/benchmark/Makefile b/benchmark/Makefile
index d9ddb9042..6a7c54636 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
 	ismax.goto idmax.goto \
 	isamin.goto idamin.goto icamin.goto izamin.goto \
 	ismin.goto idmin.goto \
-	samax.goto damax.goto camax.goto zamax.goto \
+	samax.goto damax.goto scamax.goto dzamax.goto \
 	smax.goto dmax.goto \
-	samin.goto damin.goto camin.goto zamin.goto \
+	samin.goto damin.goto scamin.goto dzamin.goto \
 	smin.goto dmin.goto \
 	saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
 	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
@@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME)
 damax.goto : damax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

-############################################## CAMAX ##############################################
-camax.goto : camax.$(SUFFIX) ../$(LIBNAME)
+############################################## SCAMAX ##############################################
+scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

-############################################## ZAMAX ##############################################
-zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME)
+############################################## DZAMAX ##############################################
+dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ############################################## SMAX ##############################################
@@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME)
 damin.goto : damin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

-############################################## CAMIN ##############################################
-camin.goto : camin.$(SUFFIX) ../$(LIBNAME)
+############################################## SCAMIN ##############################################
+scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

-############################################## ZAMIN ##############################################
-zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME)
+############################################## DZAMIN ##############################################
+dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ############################################## SMIN ##############################################
@@ -3383,10 +3383,10 @@ samax.$(SUFFIX) : amax.c
 damax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

-camax.$(SUFFIX) : amax.c
+scamax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

-zamax.$(SUFFIX) : amax.c
+dzamax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^


@@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c
 damin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

-camin.$(SUFFIX) : amin.c
+scamin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

-zamin.$(SUFFIX) : amin.c
+dzamin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^


diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c365e9a75..e27ce3bee 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S
 SAMAXKERNEL = amax_lsx.S
 DAMAXKERNEL = amax_lsx.S
 CAMAXKERNEL = camax_lsx.S
+ZAMAXKERNEL = camax_lsx.S

 SAMINKERNEL = amin_lsx.S
 DAMINKERNEL = amin_lsx.S
 CAMINKERNEL = camin_lsx.S
+ZAMINKERNEL = camin_lsx.S

 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 68360faaf..f4429cfba 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S
 SAMAXKERNEL = amax_lasx.S
 DAMAXKERNEL = amax_lasx.S
 CAMAXKERNEL = camax_lasx.S
+ZAMAXKERNEL = camax_lasx.S

 SAMINKERNEL = amin_lasx.S
 DAMINKERNEL = amin_lasx.S
 CAMINKERNEL = camin_lasx.S
+ZAMINKERNEL = camin_lasx.S

 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
index 0a4359002..c91a33006 100644
--- a/kernel/loongarch64/amin_lasx.S
+++ b/kernel/loongarch64/amin_lasx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 	xvldrepl.w VM0, X, 0
 #endif
-	XVFSUB VM0, VM0, VM0
 	bne INCX, TEMP, .L20

 	srai.d I, N, 4
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
index 644caf43c..c3c3f4ae9 100644
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 	vldrepl.w VM0, X, 0
 #endif
-	VFSUB VM0, VM0, VM0
 	bne INCX, TEMP, .L20

 	srai.d I, N, 3
diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S
index 7013430cb..f9a4e9012 100644
--- a/kernel/loongarch64/camax_lasx.S
+++ b/kernel/loongarch64/camax_lasx.S
@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bge $r0, N, .L999
 	bge $r0, INCX, .L999
 	li.d TEMP, 1
-	li.w I, -1
 	slli.d TEMP, TEMP, ZBASE_SHIFT
 	slli.d INCX, INCX, ZBASE_SHIFT
-	xvreplgr2vr.w neg1, I
-	xvffint.s.w neg1, neg1
 	srai.d I, N, 3
 	bne INCX, TEMP, .L20
 	bge $r0, I, .L23
 	.align 3

.L10:
-	xvld VX0, X, 0 * SIZE
-	xvld VX1, X, 8 * SIZE
-	addi.d I, I, -1
+	xvld VX0, X, 0
+	xvld VX1, X, 32
+#ifdef DOUBLE
+	xvpickev.d x1, VX1, VX0
+	xvpickod.d x2, VX1, VX0
+#else
 	xvpickev.w x1, VX1, VX0
 	xvpickod.w x2, VX1, VX0
-	xvfmul.s x3, neg1, x1
-	xvfmul.s x4, neg1, x2
-	xvfcmp.clt.s VT0, x1, res0
-	xvfcmp.clt.s VT1, x2, res0
-	xvbitsel.v x1, x1, x3, VT0
-	xvbitsel.v x2, x2, x4, VT1
+#endif
+	XVFSUB x3, res0, x1
+	XVFSUB x4, res0, x2
+	XVFMAX x1, x1, x3
+	XVFMAX x2, x2, x4
+	XVFADD VM1, x1, x2
+	XVFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+	xvld VX0, X, 64
+	xvld VX1, X, 96
+	xvpickev.d x1, VX1, VX0
+	xvpickod.d x2, VX1, VX0
+	XVFSUB x3, res0, x1
+	XVFSUB x4, res0, x2
+	XVFMAX x1, x1, x3
+	XVFMAX x2, x2, x4
+	XVFADD VM1, x1, x2
+	XVFMAX VM0, VM0, VM1
+#endif
+	addi.d I, I, -1
 	addi.d X, X, 16 * SIZE
-	xvfadd.s VM1, x1, x2
-	xvfmax.s VM0, VM0, VM1
 	blt $r0, I, .L10
 	.align 3

.L11:
+#ifdef DOUBLE
+	xvpickve.d x1, VM0, 0
+	xvpickve.d x2, VM0, 1
+	XVFMAX VM0, x1, x2
+#else
 	xvpickve.w x1, VM0, 0
 	xvpickve.w x2, VM0, 1
 	xvpickve.w x3, VM0, 2
 	xvpickve.w x4, VM0, 3
-	xvfmax.s VM1, x1, x2
-	xvfmax.s VM0, x3, x4
-	xvfmax.s VM0, VM0, VM1
+	XVFMAX VM0, x1, x2
+	XVFMAX VM1, x3, x4
+	XVFMAX VM0, VM0, VM1
+#endif
 	b .L23
 	.align 3

@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L21:
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
 	addi.d I, I, -1
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s3, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s3, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s4, t1, t3
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s4, t1, t3
 	blt $r0, I, .L21
 	.align 3

.L22:
-	fmax.s s1, s1, s2
-	fmax.s s3, s3, s4
-	fmax.s s1, s1, s3
+	FMAX s1, s1, s2
+	FMAX s3, s3, s4
+	FMAX s1, s1, s3
 	.align 3

.L23: //N<8
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	FABS a1, a1
 	ADD a0, a0, a1
 	add.d X, X, INCX
-	fmax.s s1, a0, s1
+	FMAX s1, a0, s1
 	blt $r0, I, .L24
 	.align 3

.L999:
-	fmov.s $f0, $f22
+	MOV $f0, $f22
 	jirl $r0, $r1, 0x0
 	.align 3

diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S
index 2e55629de..cf46cb016 100644
--- a/kernel/loongarch64/camax_lsx.S
+++ b/kernel/loongarch64/camax_lsx.S
@@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bge $r0, N, .L999
 	bge $r0, INCX, .L999
 	li.d TEMP, 1
-	li.w I, -1
 	slli.d TEMP, TEMP, ZBASE_SHIFT
 	slli.d INCX, INCX, ZBASE_SHIFT
-	vreplgr2vr.w neg1, I
-	vffint.s.w neg1, neg1
 	srai.d I, N, 3
 	bne INCX, TEMP, .L20
 	bge $r0, I, .L23
 	.align 3

.L10:
-	vld VX0, X, 0 * SIZE
-	vld VX1, X, 4 * SIZE
-	addi.d I, I, -1
+	vld VX0, X, 0
+	vld VX1, X, 16
+#ifdef DOUBLE
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+#else
 	vpickev.w x1, VX1, VX0
 	vpickod.w x2, VX1, VX0
-	vfmul.s x3, neg1, x1
-	vfmul.s x4, neg1, x2
-	vfcmp.clt.s VT0, x1, res0
-	vfcmp.clt.s VT1, x2, res0
-	vld VX0, X, 8 * SIZE
-	vbitsel.v x1, x1, x3, VT0
-	vbitsel.v x2, x2, x4, VT1
-	vld VX1, X, 12 * SIZE
-	vfadd.s VM1, x1, x2
+#endif
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD VM1, x1, x2
+
+	vld VX0, X, 32
+	vld VX1, X, 48
+#ifdef DOUBLE
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+#else
 	vpickev.w x1, VX1, VX0
 	vpickod.w x2, VX1, VX0
-	vfmul.s x3, neg1, x1
-	vfmul.s x4, neg1, x2
-	vfcmp.clt.s VT0, x1, res0
-	vfcmp.clt.s VT1, x2, res0
+#endif
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD x1, x1, x2
+	VFMAX VM1, x1, VM1
+	VFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+	vld VX0, X, 64
+	vld VX1, X, 80
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD VM1, x1, x2
+
+	vld VX0, X, 96
+	vld VX1, X, 112
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD x1, x1, x2
+	VFMAX VM1, x1, VM1
+	VFMAX VM0, VM0, VM1
+#endif
 	addi.d X, X, 16 * SIZE
-	vbitsel.v x1, x1, x3, VT0
-	vbitsel.v x2, x2, x4, VT1
-	vfadd.s x1, x1, x2
-	vfmax.s VM1, x1, VM1
-	vfmax.s VM0, VM0, VM1
+	addi.d I, I, -1
 	blt $r0, I, .L10
 	.align 3

.L11:
+#ifdef DOUBLE
+	vreplvei.d x1, VM0, 0
+	vreplvei.d x2, VM0, 1
+	VFMAX VM0, x1, x2
+#else
 	vreplvei.w x1, VM0, 0
 	vreplvei.w x2, VM0, 1
 	vreplvei.w x3, VM0, 2
 	vreplvei.w x4, VM0, 3
-	vfmax.s VM1, x1, x2
-	vfmax.s VM0, x3, x4
-	vfmax.s VM0, VM0, VM1
+	VFMAX VM1, x1, x2
+	VFMAX VM0, x3, x4
+	VFMAX VM0, VM0, VM1
+#endif
 	b .L23
 	.align 3

@@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L21:
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
 	addi.d I, I, -1
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s3, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s3, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmax.s s4, t1, t3
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMAX s4, t1, t3
 	blt $r0, I, .L21
 	.align 3

.L22:
-	fmax.s s1, s1, s2
-	fmax.s s3, s3, s4
-	fmax.s s1, s1, s3
+	FMAX s1, s1, s2
+	FMAX s3, s3, s4
+	FMAX s1, s1, s3
 	.align 3

.L23: //N<8
@@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L24:
-	fld.s a0, X, 0 * SIZE
-	fld.s a1, X, 1 * SIZE
+	LD a0, X, 0 * SIZE
+	LD a1, X, 1 * SIZE
 	addi.d I, I, -1
-	fabs.s a0, a0
-	fabs.s a1, a1
-	fadd.s a0, a0, a1
+	FABS a0, a0
+	FABS a1, a1
+	ADD a0, a0, a1
 	add.d X, X, INCX
-	fmax.s s1, a0, s1
+	FMAX s1, a0, s1
 	blt $r0, I, .L24
 	.align 3

.L999:
-	fmov.s $f0, $f22
+	MOV $f0, $f22
 	jirl $r0, $r1, 0x0
 	.align 3

diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S
index d7931d30a..c1c4c98c8 100644
--- a/kernel/loongarch64/camin_lasx.S
+++ b/kernel/loongarch64/camin_lasx.S
@@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvxor.v res0, res0, res0
 	bge $r0, N, .L999
 	bge $r0, INCX, .L999
-	fld.s a0, X, 0 * SIZE
-	fld.s a1, X, 1 * SIZE
-	fabs.s a0, a0
-	fabs.s a1, a1
-	fadd.s s1, a1, a0
+	LD a0, X, 0 * SIZE
+	LD a1, X, 1 * SIZE
+	FABS a0, a0
+	FABS a1, a1
+	ADD s1, a1, a0
+#ifdef DOUBLE
+	xvreplve0.d VM0, VM0
+#else
 	xvreplve0.w VM0, VM0
+#endif
 	li.d TEMP, 1
-	li.w I, -1
 	slli.d TEMP, TEMP, ZBASE_SHIFT
 	slli.d INCX, INCX, ZBASE_SHIFT
-	xvreplgr2vr.w neg1, I
-	xvffint.s.w neg1, neg1
 	srai.d I, N, 3
 	bne INCX, TEMP, .L20
 	bge $r0, I, .L23
 	.align 3

.L10:
-	xvld VX0, X, 0 * SIZE
-	xvld VX1, X, 8 * SIZE
-	addi.d I, I, -1
+	xvld VX0, X, 0
+	xvld VX1, X, 32
+#ifdef DOUBLE
+	xvpickev.d x1, VX1, VX0
+	xvpickod.d x2, VX1, VX0
+#else
 	xvpickev.w x1, VX1, VX0
 	xvpickod.w x2, VX1, VX0
-	xvfmul.s x3, neg1, x1
-	xvfmul.s x4, neg1, x2
-	xvfcmp.clt.s VT0, x1, res0
-	xvfcmp.clt.s VT1, x2, res0
-	xvbitsel.v x1, x1, x3, VT0
-	xvbitsel.v x2, x2, x4, VT1
+#endif
+	XVFSUB x3, res0, x1
+	XVFSUB x4, res0, x2
+	XVFMAX x1, x1, x3
+	XVFMAX x2, x2, x4
+	XVFADD VM1, x1, x2
+	XVFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+	xvld VX0, X, 64
+	xvld VX1, X, 96
+	xvpickev.d x1, VX1, VX0
+	xvpickod.d x2, VX1, VX0
+	XVFSUB x3, res0, x1
+	XVFSUB x4, res0, x2
+	XVFMAX x1, x1, x3
+	XVFMAX x2, x2, x4
+	XVFADD VM1, x1, x2
+	XVFMIN VM0, VM0, VM1
+#endif
+	addi.d I, I, -1
 	addi.d X, X, 16 * SIZE
-	xvfadd.s VM1, x1, x2
-	xvfmin.s VM0, VM0, VM1
 	blt $r0, I, .L10
 	.align 3

.L11:
+#ifdef DOUBLE
+	xvpickve.d x1, VM0, 0
+	xvpickve.d x2, VM0, 1
+	XVFMIN VM0, x1, x2
+#else
 	xvpickve.w x1, VM0, 0
 	xvpickve.w x2, VM0, 1
 	xvpickve.w x3, VM0, 2
 	xvpickve.w x4, VM0, 3
-	xvfmin.s VM1, x1, x2
-	xvfmin.s VM0, x3, x4
-	xvfmin.s VM0, VM0, VM1
+	XVFMIN VM0, x1, x2
+	XVFMIN VM1, x3, x4
+	XVFMIN VM0, VM0, VM1
+#endif
 	b .L23
 	.align 3

@@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L21:
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
 	addi.d I, I, -1
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s3, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s3, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s4, t1, t3
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s4, t1, t3
 	blt $r0, I, .L21
 	.align 3

.L22:
-	fmin.s s1, s1, s2
-	fmin.s s3, s3, s4
-	fmin.s s1, s1, s3
+	FMIN s1, s1, s2
+	FMIN s3, s3, s4
+	FMIN s1, s1, s3
 	.align 3

.L23: //N<8
@@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	FABS a1, a1
 	ADD a0, a0, a1
 	add.d X, X, INCX
-	fmin.s s1, a0, s1
+	FMIN s1, a0, s1
 	blt $r0, I, .L24
 	.align 3

.L999:
-	fmov.s $f0, $f22
+	MOV $f0, $f22
 	jirl $r0, $r1, 0x0
 	.align 3

diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S
index e9ad6b04d..ff666ea8f 100644
--- a/kernel/loongarch64/camin_lsx.S
+++ b/kernel/loongarch64/camin_lsx.S
@@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	vxor.v res0, res0, res0
 	bge $r0, N, .L999
 	bge $r0, INCX, .L999
-	fld.s a0, X, 0 * SIZE
-	fld.s a1, X, 1 * SIZE
-	fabs.s a0, a0
-	fabs.s a1, a1
-	fadd.s s1, a1, a0
+	LD a0, X, 0 * SIZE
+	LD a1, X, 1 * SIZE
+	FABS a0, a0
+	FABS a1, a1
+	ADD s1, a1, a0
+#ifdef DOUBLE
+	vreplvei.d VM0, VM0, 0
+#else
 	vreplvei.w VM0, VM0, 0
+#endif
 	li.d TEMP, 1
-	li.w I, -1
 	slli.d TEMP, TEMP, ZBASE_SHIFT
 	slli.d INCX, INCX, ZBASE_SHIFT
-	vreplgr2vr.w neg1, I
-	vffint.s.w neg1, neg1
 	srai.d I, N, 3
 	bne INCX, TEMP, .L20
 	bge $r0, I, .L23
 	.align 3

.L10:
-	vld VX0, X, 0 * SIZE
-	vld VX1, X, 4 * SIZE
+	vld VX0, X, 0
+	vld VX1, X, 16
+#ifdef DOUBLE
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+#else
+	vpickev.w x1, VX1, VX0
+	vpickod.w x2, VX1, VX0
+#endif
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD VM1, x1, x2
+
+	vld VX0, X, 32
+	vld VX1, X, 48
+#ifdef DOUBLE
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+#else
+	vpickev.w x1, VX1, VX0
+	vpickod.w x2, VX1, VX0
+#endif
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD x1, x1, x2
+	VFMIN VM1, x1, VM1
+	VFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+	vld VX0, X, 64
+	vld VX1, X, 80
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD VM1, x1, x2
+
+	vld VX0, X, 96
+	vld VX1, X, 112
+	vpickev.d x1, VX1, VX0
+	vpickod.d x2, VX1, VX0
+	VFSUB x3, res0, x1
+	VFSUB x4, res0, x2
+	VFMAX x1, x1, x3
+	VFMAX x2, x2, x4
+	VFADD x1, x1, x2
+	VFMIN VM1, x1, VM1
+	VFMIN VM0, VM0, VM1
+#endif
 	addi.d I, I, -1
-	vpickev.w x1, VX1, VX0
-	vpickod.w x2, VX1, VX0
-	vfmul.s x3, neg1, x1
-	vfmul.s x4, neg1, x2
-	vfcmp.clt.s VT0, x1, res0
-	vfcmp.clt.s VT1, x2, res0
-	vld VX0, X, 8 * SIZE
-	vbitsel.v x1, x1, x3, VT0
-	vbitsel.v x2, x2, x4, VT1
-	vld VX1, X, 12 * SIZE
-	vfadd.s VM1, x1, x2
-	vpickev.w x1, VX1, VX0
-	vpickod.w x2, VX1, VX0
-	vfmul.s x3, neg1, x1
-	vfmul.s x4, neg1, x2
-	vfcmp.clt.s VT0, x1, res0
-	vfcmp.clt.s VT1, x2, res0
 	addi.d X, X, 16 * SIZE
-	vbitsel.v x1, x1, x3, VT0
-	vbitsel.v x2, x2, x4, VT1
-	vfadd.s x1, x1, x2
-	vfmin.s VM1, x1, VM1
-	vfmin.s VM0, VM0, VM1
 	blt $r0, I, .L10
 	.align 3

.L11:
+#ifdef DOUBLE
+	vreplvei.d x1, VM0, 0
+	vreplvei.d x2, VM0, 1
+	VFMIN VM0, x1, x2
+#else
 	vreplvei.w x1, VM0, 0
 	vreplvei.w x2, VM0, 1
 	vreplvei.w x3, VM0, 2
 	vreplvei.w x4, VM0, 3
-	vfmin.s VM1, x1, x2
-	vfmin.s VM0, x3, x4
-	vfmin.s VM0, VM0, VM1
+	VFMIN VM1, x1, x2
+	VFMIN VM0, x3, x4
+	VFMIN VM0, VM0, VM1
+#endif
 	b .L23
 	.align 3

@@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L21:
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s1, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s1, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
 	addi.d I, I, -1
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s3, t1, t3
-	fld.s t1, X, 0 * SIZE
-	fld.s t2, X, 1 * SIZE
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s3, t1, t3
+	LD t1, X, 0 * SIZE
+	LD t2, X, 1 * SIZE
 	add.d X, X, INCX
-	fld.s t3, X, 0 * SIZE
-	fld.s t4, X, 1 * SIZE
+	LD t3, X, 0 * SIZE
+	LD t4, X, 1 * SIZE
 	add.d X, X, INCX
-	fabs.s t1, t1
-	fabs.s t2, t2
-	fabs.s t3, t3
-	fabs.s t4, t4
-	fadd.s t1, t1, t2
-	fadd.s t3, t3, t4
-	fmin.s s4, t1, t3
+	FABS t1, t1
+	FABS t2, t2
+	FABS t3, t3
+	FABS t4, t4
+	ADD t1, t1, t2
+	ADD t3, t3, t4
+	FMIN s4, t1, t3
 	blt $r0, I, .L21
 	.align 3

.L22:
-	fmin.s s1, s1, s2
-	fmin.s s3, s3, s4
-	fmin.s s1, s1, s3
+	FMIN s1, s1, s2
+	FMIN s3, s3, s4
+	FMIN s1, s1, s3
 	.align 3

.L23: //N<8
@@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.align 3

.L24:
-	fld.s a0, X, 0 * SIZE
-	fld.s a1, X, 1 * SIZE
+	LD a0, X, 0 * SIZE
+	LD a1, X, 1 * SIZE
 	addi.d I, I, -1
-	fabs.s a0, a0
-	fabs.s a1, a1
-	fadd.s a0, a0, a1
+	FABS a0, a0
+	FABS a1, a1
+	ADD a0, a0, a1
 	add.d X, X, INCX
-	fmin.s s1, a0, s1
+	FMIN s1, a0, s1
 	blt $r0, I, .L24
 	.align 3

.L999:
-	fmov.s $f0, $f22
+	MOV $f0, $f22
 	jirl $r0, $r1, 0x0
 	.align 3

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index c47954ce4..41829bd22 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -16,6 +16,7 @@ else ()
 	test_dnrm2.c
 	test_swap.c
 	test_zscal.c
+	test_amin.c
 	)

 endif ()
diff --git a/utest/Makefile b/utest/Makefile
index d0715c754..8acaa3ea9 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,7 +11,8 @@ UTESTBIN=openblas_utest

 include $(TOPDIR)/Makefile.system

-OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o
+OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \
+	test_amin.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o

 ifneq ($(NO_LAPACK), 1)
diff --git a/utest/test_amax.c b/utest/test_amax.c
index a9e5a1c85..e9775caf0 100644
--- a/utest/test_amax.c
+++ b/utest/test_amax.c
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2016, The OpenBLAS Project
+Copyright (c) 2011-2024, The OpenBLAS Project
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@@ -13,9 +13,9 @@ met:
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.
-   3. Neither the name of the OpenBLAS project nor the names of 
-      its contributors may be used to endorse or promote products 
-      derived from this software without specific prior written 
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
       permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -57,4 +57,31 @@ CTEST(amax, damax){
   ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
 }
 #endif
+#ifdef BUILD_COMPLEX
+CTEST(amax, scamax){
+  blasint N = 9, inc = 1;
+  float te_max = 0.0, tr_max = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+  te_max = BLASFUNC(scamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amax, dzamax){
+  blasint N = 9, inc = 1;
+  double te_max = 0.0, tr_max = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+
+  te_max = BLASFUNC(dzamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_amin.c b/utest/test_amin.c
new file mode 100644
index 000000000..1305ab8ef
--- /dev/null
+++ b/utest/test_amin.c
@@ -0,0 +1,89 @@
+/*****************************************************************************
+Copyright (c) 2011-2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(amin, samin){
+  blasint N = 3, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9 };
+
+  te_min = BLASFUNC(samin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_DOUBLE
+CTEST(amin, damin){
+  blasint N = 3, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9 };
+
+  te_min = BLASFUNC(damin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX
+CTEST(amin, scamin){
+  blasint N = 9, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+
+  te_min = BLASFUNC(scamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amin, dzamin){
+  blasint N = 9, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+                -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+                -7.7, 8.8 };
+
+  te_min = BLASFUNC(dzamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
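
A note on the semantics exercised above: BLAS scamax/dzamax and scamin/dzamin reduce a complex vector over |Re(x_i)| + |Im(x_i)|, not over the complex modulus. In the rewritten vector loops, the de-interleaved real and imaginary parts are made absolute by subtracting from the zeroed res0 and taking the elementwise maximum (VFSUB/XVFSUB then VFMAX/XVFMAX, replacing the old sign-mask-and-select sequence), summed, and folded into the running maximum or minimum. The following is a minimal C sketch of that reduction for the double-precision case; dzamax_ref and dzamin_ref are illustrative names, not the library's API, and the data mirrors the dzamax/dzamin utest cases.

#include <math.h>
#include <stdio.h>

/* Reference model of the |Re| + |Im| reduction performed by the camax/camin
 * kernels above. Illustrative only, not OpenBLAS internals. Assumes n >= 1
 * and incx >= 1, with interleaved storage: element i has its real part at
 * x[2*i*incx] and its imaginary part at x[2*i*incx + 1]. */
static double dzamax_ref(int n, const double *x, int incx)
{
    double m = fabs(x[0]) + fabs(x[1]);
    for (int i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v > m) m = v;
    }
    return m;
}

static double dzamin_ref(int n, const double *x, int incx)
{
    double m = fabs(x[0]) + fabs(x[1]);
    for (int i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v < m) m = v;
    }
    return m;
}

int main(void)
{
    /* Same data as the dzamax/dzamin utest cases above: nine complex values. */
    double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                   -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
                   -7.7, 8.8 };
    printf("dzamax: %g\n", dzamax_ref(9, x, 1)); /* 20  = |-9.9| + |10.10| */
    printf("dzamin: %g\n", dzamin_ref(9, x, 1)); /* 3.3 = |-1.1| + |2.2|   */
    return 0;
}

This is why the new tests assert 20.0 for the amax cases and 3.3 for the amin cases rather than values of the complex modulus.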