diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 712aa0e74..94dfd9d4b 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -50,4 +50,13 @@ DSWAPKERNEL = swap.S CSWAPKERNEL = swap.S ZSWAPKERNEL = swap.S +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S new file mode 100644 index 000000000..6279c2250 --- /dev/null +++ b/kernel/arm64/gemv_n.S @@ -0,0 +1,320 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA s0 +#define TEMP s1 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define ALPHA d0 +#define TEMP d1 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F16 +#if !defined(DOUBLE) + ld1 {v2.4s, v3.4s}, [A_PTR], #32 + ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 + fmla v4.4s, v1.4s, v2.4s + fmla v5.4s, v1.4s, v3.4s + st1 {v4.4s, v5.4s}, [Y_OPTR], #32 + + ld1 {v6.4s, v7.4s}, [A_PTR], #32 + ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 + fmla v8.4s, v1.4s, v6.4s + fmla v9.4s, v1.4s, v7.4s + st1 {v8.4s, v9.4s}, [Y_OPTR], #32 +#else //DOUBLE + ld1 {v2.2d, v3.2d}, [A_PTR], #32 + ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 + fmla v4.2d, v1.2d, v2.2d + fmla v5.2d, v1.2d, v3.2d + st1 {v4.2d, v5.2d}, [Y_OPTR], #32 + + ld1 {v6.2d, v7.2d}, [A_PTR], #32 + ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 + fmla v8.2d, v1.2d, v6.2d + fmla v9.2d, v1.2d, v7.2d + st1 {v8.2d, v9.2d}, [Y_OPTR], #32 + + ld1 {v10.2d, v11.2d}, [A_PTR], #32 + ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 + fmla v12.2d, v1.2d, v10.2d + fmla v13.2d, v1.2d, v11.2d + st1 {v12.2d, v13.2d}, [Y_OPTR], #32 + + ld1 {v14.2d, v15.2d}, [A_PTR], #32 + ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 + fmla v16.2d, v1.2d, v14.2d + fmla v17.2d, v1.2d, v15.2d + st1 {v16.2d, v17.2d}, [Y_OPTR], #32 +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [Y_IPTR], #16 + fmla v3.4s, v1.4s, v2.4s + st1 {v3.4s}, [Y_OPTR], #16 +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [Y_IPTR], #16 + fmla v3.2d, v1.2d, v2.2d + st1 {v3.2d}, [Y_OPTR], #16 + + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + fmla v5.2d, v1.2d, v4.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro 
KERNEL_F1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], #SZ + +.endm + +.macro INIT_S + + lsl INC_Y, INC_Y, #SHZ + +.endm + +.macro KERNEL_S1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], INC_Y + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_n_kernel_L999 + cmp M, xzr + ble gemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + cmp INC_Y, #1 + bne gemv_n_kernel_S_BEGIN + +gemv_n_kernel_F_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP +#if !defined(DOUBLE) + ins v1.s[1], v1.s[0] + ins v1.s[2], v1.s[0] + ins v1.s[3], v1.s[0] +#else + ins v1.d[1], v1.d[0] +#endif + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + +gemv_n_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_n_kernel_F4 + +gemv_n_kernel_F320: + + KERNEL_F16 + KERNEL_F16 + + subs I, I, #1 + bne gemv_n_kernel_F320 + +gemv_n_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_n_kernel_F1 + +gemv_n_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_n_kernel_F40 + +gemv_n_kernel_F1: + ands I, M, #3 + ble gemv_n_kernel_F_END + +gemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_n_kernel_F10 + +gemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_F_LOOP + + b gemv_n_kernel_L999 + +gemv_n_kernel_S_BEGIN: + + INIT_S + +gemv_n_kernel_S_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP + mov A_PTR, A + mov Y_IPTR, Y + + asr I, M, #2 + cmp I, xzr + ble gemv_n_kernel_S1 + +gemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S4 + +gemv_n_kernel_S1: + + ands I, M, #3 + ble gemv_n_kernel_S_END + +gemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S10 + +gemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_S_LOOP + +gemv_n_kernel_L999: + + mov w0, wzr + + RESTORE_REGS + + ret + + EPILOGUE diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S new file mode 100644 index 000000000..0145af621 --- /dev/null +++ b/kernel/arm64/gemv_t.S @@ -0,0 +1,347 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop X vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define ALPHA s0 +#define TEMP s1 +#define TEMP1 s2 +#define TEMP2 s3 +#define TEMP3 s4 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define REG0 xzr +#define ALPHA d0 +#define TEMP d1 +#define TEMP1 d2 +#define TEMP2 d3 +#define TEMP3 d4 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F32 +#if !defined(DOUBLE) + ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 + ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 + fmla v1.4s, v5.4s, v9.4s + fmla v2.4s, v6.4s, v10.4s + fmla v3.4s, v7.4s, v11.4s + fmla v4.4s, v8.4s, v12.4s + + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 + ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 + fmla v1.4s, v13.4s, v17.4s + fmla v2.4s, v14.4s, v18.4s + fmla v3.4s, v15.4s, v19.4s + fmla v4.4s, v16.4s, v20.4s +#else + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + 
fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d + + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d +#endif +.endm + +.macro KERNEL_F32_FINALIZE +#if !defined(DOUBLE) + fadd v1.4s, v1.4s, v2.4s + fadd v1.4s, v1.4s, v3.4s + fadd v1.4s, v1.4s, v4.4s +#else + fadd v1.2d, v1.2d, v2.2d + fadd v1.2d, v1.2d, v3.2d + fadd v1.2d, v1.2d, v4.2d +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [X_PTR], #16 + fmla v1.4s, v2.4s, v3.4s +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [X_PTR], #16 + fmla v1.2d, v2.2d, v3.2d + + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [X_PTR], #16 + fmla v1.2d, v4.2d, v5.2d +#endif +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v2.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v2.2s + faddp TEMP, v1.2s +#else + faddp TEMP, v1.2d +#endif +.endm + +.macro KERNEL_F1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], #SZ + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], INC_X + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_t_kernel_L999 + cmp M, xzr + ble gemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + cmp INC_X, #1 + bne gemv_t_kernel_S_BEGIN + +gemv_t_kernel_F_LOOP: + + fmov TEMP, REG0 + fmov TEMP1, REG0 + fmov TEMP2, REG0 + fmov TEMP3, REG0 + + mov A_PTR, A + mov X_PTR, X + +gemv_t_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_t_kernel_F4 + +gemv_t_kernel_F320: + + KERNEL_F32 + + subs I, I, #1 + bne gemv_t_kernel_F320 + + KERNEL_F32_FINALIZE + +gemv_t_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_t_kernel_F1 + +gemv_t_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_t_kernel_F40 + +gemv_t_kernel_F1: + + KERNEL_F4_FINALIZE + + ands I, M, #3 + ble gemv_t_kernel_F_END + +gemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_t_kernel_F10 + +gemv_t_kernel_F_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_F_LOOP + + b gemv_t_kernel_L999 + +gemv_t_kernel_S_BEGIN: + + INIT_S + +gemv_t_kernel_S_LOOP: + + fmov TEMP, REG0 + mov A_PTR, A + mov X_PTR, X + + asr I, M, #2 + cmp I, xzr + ble gemv_t_kernel_S1 + +gemv_t_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S4 + +gemv_t_kernel_S1: + + ands I, M, #3 + ble gemv_t_kernel_S_END + +gemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S10 + +gemv_t_kernel_S_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_S_LOOP + +gemv_t_kernel_L999: + + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S new file mode 100644 index 000000000..9c5ec490c --- /dev/null +++ 
b/kernel/arm64/zgemv_n.S @@ -0,0 +1,514 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ +#define X_PTR x14 /* loop X vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 
16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + + +.macro INIT + /********** INIT FOR F4 LOOP **********/ + fmov ALPHA_R_COPY, ALPHA_R + fmov ALPHA_I_COPY, ALPHA_I +#if !defined(DOUBLE) + ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) + ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) + ins v7.d[1], v7.d[0] + ins v8.d[1], v8.d[0] +#else + ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) + ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) +#endif + + /******* INIT FOR F1 AND S1 LOOP ******/ +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) + fneg s2, ALPHA_I + ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) +#endif +#else + ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) + fneg d2, ALPHA_I + ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) +#endif +#endif +.endm + +.macro INIT_LOOP + /********** INIT_LOOP FOR F4 LOOP **********/ +#if !defined(DOUBLE) + ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] + ins v10.s[0], v9.s[1] + ins v9.s[1], v9.s[0] // [R(X), R(X)] + ins v10.s[1], v10.s[0] // [I(X), I(X)] + ins v9.d[1], v9.d[0] + ins v10.d[1], v10.d[0] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] + fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] + fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] + ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] + fmul v2.2s, v0.2s, v2.2s + fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] + ins v3.s[0], v2.s[1] +#if !defined(CONJ) +#if !defined(XCONJ) + fneg s4, s3 + ins v3.s[1], v4.s[0] + ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#else + fneg s4, s3 + ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] + fneg s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#else + fneg s3, s3 + ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] + fneg s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#else // DOUBLE + + /********** INIT_LOOP FOR F4 LOOP **********/ + ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] + ins v10.d[0], v9.d[1] + ins v9.d[1], v9.d[0] // [R(X), R(X)] + ins v10.d[1], v10.d[0] // [I(X), I(X)] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmla v11.2d, v10.2d, 
v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] + fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] + fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] + ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] + ins v3.d[0], v2.d[1] // I(TEMP) +#if !defined(CONJ) +#if !defined(XCONJ) + fneg d4, d3 // -I(TEMP) + ins v3.d[1], v4.d[0] + ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#else + fneg d4, d3 // -I(TEMP) + ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] + fneg d4, d2 // -R(TEMP) + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#else + fneg d3, d3 // -I(TEMP) + ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] + fneg d4, d2 // -R(TEMP) + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#endif // DOUBLE +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + ld2 {v13.4s, v14.4s}, [A_PTR], #32 + ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 + +#else // DOUBLE + + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- 
R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 + + ld2 {v17.2d, v18.2d}, [A_PTR], #32 + ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 + +#endif + +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], #8 + ext v6.8b, v4.8b, v4.8b, #4 + fmla v5.2s, v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], #8 +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro INIT_S + lsl INC_Y, INC_Y, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], INC_Y + ext v6.8b, v4.8b, v4.8b, #4 + fmla v5.2s, v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], INC_Y + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], INC_Y +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble zgemv_n_kernel_L999 + cmp M, xzr + ble zgemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + INIT + + cmp INC_Y, #1 + bne zgemv_n_kernel_S_BEGIN + +zgemv_n_kernel_F_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_n_kernel_F1 + +zgemv_n_kernel_F4: + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F4 + +zgemv_n_kernel_F1: + + ands I, M, #3 + ble zgemv_n_kernel_F_END + +zgemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F10 + +zgemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_F_LOOP + + b zgemv_n_kernel_L999 + +zgemv_n_kernel_S_BEGIN: + + INIT_S + +zgemv_n_kernel_S_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr 
I, M, #2 + cmp I, xzr + ble zgemv_n_kernel_S1 + +zgemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S4 + +zgemv_n_kernel_S1: + + ands I, M, #3 + ble zgemv_n_kernel_S_END + +zgemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S10 + +zgemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_S_LOOP + +zgemv_n_kernel_L999: + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S new file mode 100644 index 000000000..1f0d698ed --- /dev/null +++ b/kernel/arm64/zgemv_t.S @@ -0,0 +1,448 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro INIT +#if !defined(XCONJ) +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R + fneg s2, ALPHA_I + ins v1.s[1], v2.s[0] + ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I +#else + ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R + fneg d2, ALPHA_I + ins v1.d[1], v2.d[0] + ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I +#endif +#else // XCONJ +#if !defined(DOUBLE) + fneg s2, ALPHA_R + ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I +#else + fneg d2, ALPHA_R + ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I +#endif +#endif +.endm + +.macro INIT_LOOP + fmov d9, xzr // TEMP_R = [0, 0] + fmov d10, xzr // TEMP_I = [0, 0] +#if !defined(DOUBLE) +#else + fmov d15, xzr // TEMP_R = [0, 0] + fmov d16, xzr // TEMP_I = [0, 0] +#endif + + fmov d2, xzr // TEMP = [0, 0] +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + ld2 {v11.4s, v12.4s}, [X_PTR], #32 + ld2 {v13.4s, v14.4s}, [A_PTR], #32 + +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] + fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] + fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] +#else + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] + fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] + fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] + fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] + fmla v10.4s, v12.4s, 
v13.4s // [+ I(X) * A_R] +#else + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] + fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] + fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] +#endif +#endif // CONJ + +#else // DOUBLE + ld2 {v11.2d, v12.2d}, [X_PTR], #32 + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #512] +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] + fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#else + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] + fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] + fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] + fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#else + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] + fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] + fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] +#endif +#endif // CONJ + ld2 {v17.2d, v18.2d}, [X_PTR], #32 + ld2 {v19.2d, v20.2d}, [A_PTR], #32 + prfm PLDL1STRM, [A_PTR, #512] +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#else + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#else + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#endif +#endif // CONJ +#endif //DOUBLE +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v21.16b, v9.16b, v9.16b, #8 + fadd v9.2s, v9.2s, v21.2s + faddp s9, v9.2s + + ext v21.16b, v10.16b, v10.16b, #8 + fadd v10.2s, v10.2s, v21.2s + faddp s10, v10.2s + + ins v2.s[0], v9.s[0] + ins v2.s[1], v10.s[0] +#else + fadd v9.2d, v9.2d, v15.2d + fadd v10.2d, v10.2d, v16.2d + + faddp d9, v9.2d + faddp d10, v10.2d + + ins v2.d[0], v9.d[0] + ins v2.d[1], v10.d[0] +#endif +.endm + + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] + fneg s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] + fneg d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] +#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla 
v2.2d, v5.2d, v7.2d +#endif +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] + fneg s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] + fneg d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] +#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla v2.2d, v5.2d, v7.2d +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + SAVE_REGS + + cmp N, xzr + ble zgemv_t_kernel_L999 + cmp M, xzr + ble zgemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + INIT + + cmp INC_X, #1 + bne zgemv_t_kernel_S_BEGIN + +zgemv_t_kernel_F_LOOP: + + mov A_PTR, A + mov X_PTR, X + + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_t_kernel_F1 + +zgemv_t_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zgemv_t_kernel_F4 + + KERNEL_F4_FINALIZE + +zgemv_t_kernel_F1: + + ands I, M, #3 + ble zgemv_t_kernel_F_END + +zgemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_t_kernel_F10 + +zgemv_t_kernel_F_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_F_LOOP + + b zgemv_t_kernel_L999 + +zgemv_t_kernel_S_BEGIN: + + INIT_S + +zgemv_t_kernel_S_LOOP: + + mov A_PTR, A + mov X_PTR, X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + ble zgemv_t_kernel_S1 + +zgemv_t_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S4 + +zgemv_t_kernel_S1: + + ands I, M, #3 + ble zgemv_t_kernel_S_END + +zgemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S10 + +zgemv_t_kernel_S_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_S_LOOP + +zgemv_t_kernel_L999: + RESTORE_REGS + mov w0, wzr + ret + + EPILOGUE
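
The kernels above are selected through KERNEL.CORTEXA57 (SGEMVNKERNEL/DGEMVNKERNEL map to gemv_n.S, the transposed variants to gemv_t.S, and the complex types to zgemv_n.S/zgemv_t.S) and are reached through the ordinary BLAS gemv entry points; the assembly itself only accumulates y += alpha * op(A) * x, with beta scaling of y expected to be handled by the generic interface layer before the kernel runs. Below is a minimal C sketch of how the non-transposed double path would be exercised once OpenBLAS is built for this target; the matrix sizes, values, source file name, and the TARGET=CORTEXA57 build assumption are illustrative only and not part of the patch.

    #include <stdio.h>
    #include <cblas.h>          /* OpenBLAS CBLAS interface */

    int main(void) {
        /* y := alpha*A*x + beta*y with a column-major 3x2 A;
           the CblasNoTrans double path would land in DGEMVNKERNEL = gemv_n.S */
        double A[6] = {1, 2, 3,     /* first column  */
                       4, 5, 6};    /* second column */
        double x[2] = {1.0, 0.5};
        double y[3] = {0.0, 0.0, 0.0};

        cblas_dgemv(CblasColMajor, CblasNoTrans, 3, 2,
                    2.0 /* alpha */, A, 3 /* lda */,
                    x, 1 /* incx */, 0.0 /* beta */, y, 1 /* incy */);

        for (int i = 0; i < 3; i++)
            printf("y[%d] = %g\n", i, y[i]);   /* A*x = (3, 4.5, 6), times alpha=2 -> 6 9 12 */
        return 0;
    }

Built and linked against libopenblas (e.g. cc gemv_example.c -lopenblas, file name hypothetical), the same call with CblasTrans would route through gemv_t.S, and cblas_cgemv/cblas_zgemv through the zgemv_n.S/zgemv_t.S files added above; strides other than 1 for incx/incy exercise the *_S_* (strided) loops rather than the *_F_* (contiguous) ones.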