diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 5e2632574..826588318 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -70,6 +70,8 @@ DROTKERNEL = rot_lsx.S
 
 SNRM2KERNEL = snrm2_lsx.S
 DNRM2KERNEL = dnrm2_lsx.S
+CNRM2KERNEL = cnrm2_lsx.S
+ZNRM2KERNEL = znrm2_lsx.S
 
 DGEMMKERNEL = dgemm_kernel_8x4.S
 DGEMMINCOPY = dgemm_ncopy_8_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 20a4d9a7e..b61ecd427 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -70,6 +70,8 @@ DROTKERNEL = rot_lasx.S
 
 SNRM2KERNEL = snrm2_lasx.S
 DNRM2KERNEL = dnrm2_lasx.S
+CNRM2KERNEL = cnrm2_lasx.S
+ZNRM2KERNEL = znrm2_lasx.S
 
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S
new file mode 100644
index 000000000..3a60069ac
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lasx.S
@@ -0,0 +1,148 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define a1 $f15
+#define a2 $f16
+#define res $f19
+#define VX0 $xr15
+#define VX1 $xr16
+#define VX2 $xr17
+#define VX3 $xr18
+#define VX4 $xr21
+#define res1 $xr19
+#define res2 $xr20
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+#endif
+
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L997
+    .align 3
+
+.L10: // INCX==1, 4 complex elements per iteration
+    xvld VX0, X, 0 * SIZE
+    xvfcvtl.d.s VX1, VX0
+    xvfcvth.d.s VX2, VX0
+    xvfmadd.d res1, VX1, VX1, res1
+    xvfmadd.d res2, VX2, VX2, res2
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L996
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L997
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfcvtl.d.s VX1, VX0
+    xvfcvth.d.s VX2, VX0
+    xvfmadd.d res1, VX1, VX1, res1
+    xvfmadd.d res2, VX2, VX2, res2
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L996
+
+.L996: // horizontal sum of the four partial sums
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // scalar tail, one complex element per iteration
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    addi.d I, I, -1
+    fcvt.d.s a1, a1
+    fcvt.d.s a2, a2
+    fmadd.d res, a1, a1, res
+    fmadd.d res, a2, a2, res
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d res, res
+    move $r4, $r17
+    fcvt.s.d $f0, res
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define a1 $f15
+#define a2 $f16
+#define res $f19
+#define VX0 $vr15
+#define VX1 $vr16
+#define VX2 $vr17
+#define VX3 $vr18
+#define VX4 $vr21
+#define res1 $vr19
+#define res2 $vr20
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+#endif
+
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L997
+    .align 3
+
+.L10: // INCX==1, 4 complex elements per iteration
+    vld VX0, X, 0 * SIZE
+    vfcvtl.d.s VX1, VX0
+    vfcvth.d.s VX2, VX0
+    vfmadd.d res1, VX1, VX1, res1
+    vfmadd.d res2, VX2, VX2, res2
+    vld VX0, X, 4 * SIZE
+    vfcvtl.d.s VX3, VX0
+    vfcvth.d.s VX4, VX0
+    vfmadd.d res1, VX3, VX3, res1
+    vfmadd.d res2, VX4, VX4, res2
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L996
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L997
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfcvtl.d.s VX1, VX0
+    vfcvth.d.s VX2, VX0
+    vfmadd.d res1, VX1, VX1, res1
+    vfmadd.d res2, VX2, VX2, res2
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfcvtl.d.s VX3, VX0
+    vfcvth.d.s VX4, VX0
+    vfmadd.d res1, VX3, VX3, res1
+    vfmadd.d res2, VX4, VX4, res2
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L996
+    .align 3
+
+.L996: // horizontal sum of the two lanes
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // scalar tail, one complex element per iteration
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    addi.d I, I, -1
+    fcvt.d.s a1, a1
+    fcvt.d.s a2, a2
+    fmadd.d res, a1, a1, res
+    fmadd.d res, a2, a2, res
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d res, res
+    move $r4, $r17
+    fcvt.s.d $f0, res
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S
new file mode 100644
index 000000000..53f8a6e05
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lasx.S
@@ -0,0 +1,252 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define XX $r19
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define INF $f23
+#define a1 $f22
+#define max $f17
+#define ALPHA $f12
+#define a3 $f15
+#define a2 $f16
+#define VX0 $xr15
+#define VX1 $xr16
+#define VM0 $xr17
+#define VM1 $xr18
+#define VM2 $xr13
+#define VM3 $xr14
+#define res1 $xr19
+#define res2 $xr20
+#define VALPHA $xr21
+
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+#endif
+
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v VM0, VM0, VM0
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move XX, X
+    // Init INF
+    addi.d TEMP, $r0, 0x7FF
+    slli.d TEMP, TEMP, 52
+    MTC INF, TEMP
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L97
+    .align 3
+
+.L10: // INCX==1, pass 1: find the largest |component|
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96: // horizontal max of the four lanes
+    xvpickve.d VX0, VM0, 1
+    xvpickve.d VX1, VM0, 2
+    xvpickve.d VM3, VM0, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM2, VM3, VM0
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L97:
+    andi I, N, 3
+    bge $r0, I, .L99
+    .align 3
+
+.L98: // scalar tail of pass 1
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d max, max
+    lu12i.w TEMP, 0x3f800 // 0x3f800000 = 1.0f
+    movgr2fr.d a1, $r0
+    movgr2fr.w ALPHA, TEMP
+    CMPEQ $fcc0, max, a1
+    fcvt.d.s ALPHA, ALPHA
+    bcnez $fcc0, .L999 // max == 0: the norm is 0
+    fdiv.d ALPHA, ALPHA, max
+    CMPEQ $fcc0, INF, ALPHA
+    bcnez $fcc0, .L999 // 1/max overflowed: skip the scaled pass
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+
+.L100: // pass 2: accumulate (x * 1/max)^2
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    xvld VX0, XX, 0 * SIZE
+    xvld VX1, XX, 4 * SIZE
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996: // horizontal sum of the four partial sums
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX0, res1, 1
+    xvpickve.d VX1, res1, 2
+    xvpickve.d VM2, res1, 3
+    xvfadd.d res1, VX0, res1
+    xvfadd.d VX1, VX1, VM2
+    xvfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // scalar tail of pass 2
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19
+    add.d XX, XX, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S
new file mode 100644
index 000000000..14c59d504
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lsx.S
@@ -0,0 +1,260 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define XX $r19
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define INF $f23
+#define a1 $f22
+#define max $f17
+#define ALPHA $f12
+#define a3 $f15
+#define a2 $f16
+#define VX0 $vr15
+#define VX1 $vr16
+#define VM0 $vr17
+#define VM1 $vr18
+#define VM2 $vr13
+#define VM3 $vr14
+#define res1 $vr19
+#define res2 $vr20
+#define VALPHA $vr21
+
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+#endif
+
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v VM0, VM0, VM0
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move XX, X
+    // Init INF
+    addi.d TEMP, $r0, 0x7FF
+    slli.d TEMP, TEMP, 52
+    MTC INF, TEMP
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L97
+    .align 3
+
+.L10: // INCX==1, pass 1: find the largest |component|
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmaxa.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96: // horizontal max of the two lanes
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    vfmaxa.d VM0, VX0, VX1
+    .align 3
+
+.L97:
+    andi I, N, 3
+    bge $r0, I, .L99
+    .align 3
+
+.L98: // scalar tail of pass 1
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d max, max
+    lu12i.w TEMP, 0x3f800 // 0x3f800000 = 1.0f
+    movgr2fr.d a1, $r0
+    movgr2fr.w ALPHA, TEMP
+    CMPEQ $fcc0, max, a1
+    fcvt.d.s ALPHA, ALPHA
+    bcnez $fcc0, .L999 // max == 0: the norm is 0
+    fdiv.d ALPHA, ALPHA, max
+    CMPEQ $fcc0, INF, ALPHA
+    bcnez $fcc0, .L999 // 1/max overflowed: skip the scaled pass
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP
+
+.L100: // pass 2: accumulate (x * 1/max)^2
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    vld VX0, XX, 0 * SIZE
+    vld VX1, XX, 2 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    vld VX0, XX, 4 * SIZE
+    vld VX1, XX, 6 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    ld.d t1, XX, 0 * SIZE
+    vfmul.d VM3, VX1, VALPHA
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996: // horizontal sum of the two lanes
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // scalar tail of pass 2
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19
+    add.d XX, XX, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
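
Review note (not part of the patch): as a reading aid, here is a minimal scalar sketch in C of what the four kernels above compute. It is a sketch under stated assumptions, not OpenBLAS code: the names ref_cnrm2 and ref_znrm2 are hypothetical, inc_x is taken as positive, and n <= 0 or inc_x == 0 is assumed to be handled by the kernels' own entry guards (the bge/beq branches to .L999, which return 0).

    #include <math.h>
    #include <stddef.h>

    /* cnrm2_lsx.S / cnrm2_lasx.S: widen each float component to double
     * (vfcvtl/vfcvth.d.s), accumulate a plain sum of squares with FMAs,
     * take the root, and narrow back to single (fcvt.s.d $f0, res). */
    static float ref_cnrm2(size_t n, const float *x, size_t inc_x)
    {
        double ssq = 0.0;
        for (size_t i = 0; i < n; i++) {
            double re = x[0], im = x[1];
            ssq += re * re + im * im;
            x += 2 * inc_x;          /* complex stride, cf. ZBASE_SHIFT */
        }
        return (float)sqrt(ssq);
    }

    /* znrm2_lsx.S / znrm2_lasx.S: pass 1 finds the largest |component|
     * (vfmaxa.d/fmaxa.d select by magnitude, hence the single fabs at
     * .L99); pass 2 accumulates (x / max)^2 so the squares cannot
     * overflow, and the result is max * sqrt(sum). The two early returns
     * mirror the kernels' bcnez $fcc0, .L999 branches, which also yield
     * 0 because the accumulator is still zero at that point. */
    static double ref_znrm2(size_t n, const double *x, size_t inc_x)
    {
        double max = 0.0, ssq = 0.0;
        const double *p = x;
        for (size_t i = 0; i < n; i++) {
            if (fabs(p[0]) > max) max = fabs(p[0]);
            if (fabs(p[1]) > max) max = fabs(p[1]);
            p += 2 * inc_x;
        }
        if (max == 0.0)
            return 0.0;              /* CMPEQ max, 0 -> .L999 */
        double alpha = 1.0 / max;
        if (isinf(alpha))
            return 0.0;              /* CMPEQ INF, ALPHA -> .L999 */
        for (size_t i = 0; i < n; i++) {
            double re = x[0] * alpha, im = x[1] * alpha;
            ssq += re * re + im * im;
            x += 2 * inc_x;
        }
        return max * sqrt(ssq);
    }

One detail the sketch flattens: the vector kernels keep two independent accumulators (res1/res2) to hide FMA latency and only combine them in the horizontal reduction at .L996, whereas the C version uses a single running sum.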