diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index ab3e855c4..fb19c0918 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -15,6 +15,10 @@ DSWAPKERNEL = swap_thunderx2t99.S CSWAPKERNEL = swap_thunderx2t99.S ZSWAPKERNEL = swap_thunderx2t99.S +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c + + SNRM2KERNEL = snrm2_thunderx2t99.c CNRM2KERNEL = cnrm2_thunderx2t99.S diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c new file mode 100644 index 000000000..62429fd40 --- /dev/null +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -0,0 +1,380 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define N "x0" /* vector length */ +#define X "x1" /* "X" vector address */ +#define INC_X "x2" /* "X" stride */ +#define INDEX "x3" /* index of max/min value */ +#define Z "x4" /* vector index */ +#define J "x5" /* loop variable */ + +#if !defined(DOUBLE) +#define MAXF "s0" +#define TMPF0 "s1" +#define TMPF1 "s4" +#define N_KERNEL_SIZE "64" +#define SZ "4" +#define N_DIV_SHIFT "6" +#define N_REM_MASK "63" +#define INC_SHIFT "2" +#else +#define MAXF "d0" +#define TMPF0 "d1" +#define TMPF1 "d4" +#define N_KERNEL_SIZE "32" +#define SZ "8" +#define N_DIV_SHIFT "5" +#define N_REM_MASK "31" +#define INC_SHIFT "3" +#endif + +/******************************************************************************/ + +#if !defined(DOUBLE) +#define KERNEL_F \ + "ldp q2, q3, ["X"] \n" \ + "ldp q4, q5, ["X", #32] \n" \ + "ldp q6, q7, ["X", #64] \n" \ + "ldp q16, q17, ["X", #96] \n" \ + "ldp q18, q19, ["X", #128] \n" \ + "ldp q20, q21, ["X", #160] \n" \ + "ldp q22, q23, ["X", #192] \n" \ + "ldp q24, q25, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fabs v2.4s, v2.4s \n" \ + "fabs v3.4s, v3.4s \n" \ + "fabs v4.4s, v4.4s \n" \ + "fabs v5.4s, v5.4s \n" \ + "fabs v6.4s, v6.4s \n" \ + "fabs v7.4s, v7.4s \n" \ + "fabs v16.4s, v16.4s \n" \ + "fabs v17.4s, v17.4s \n" \ + "fabs v18.4s, v18.4s \n" \ + "fabs v19.4s, v19.4s \n" \ + "fabs v20.4s, v20.4s \n" \ + "fabs v21.4s, v21.4s \n" \ + "fabs v22.4s, v22.4s \n" \ + "fabs v23.4s, v23.4s \n" \ + "fabs v24.4s, v24.4s \n" \ + "fabs v25.4s, v25.4s \n" \ + "fmax v2.4s, v2.4s, v3.4s \n" \ + "fmax v4.4s, v4.4s, v5.4s \n" \ + "fmax v6.4s, v6.4s, v7.4s \n" \ + "fmax v16.4s, v16.4s, v17.4s \n" \ + "fmax v18.4s, v18.4s, v19.4s \n" \ + "fmax v20.4s, v20.4s, v21.4s \n" \ + "fmax v22.4s, v22.4s, v23.4s \n" \ + "fmax v24.4s, v24.4s, v25.4s \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fmax v2.4s, v2.4s, v4.4s \n" \ + "fmax v6.4s, v6.4s, v16.4s \n" \ + "fmax v18.4s, v18.4s, v20.4s \n" \ + "fmax v22.4s, v22.4s, v24.4s \n" \ + "fmax v2.4s, v2.4s, v6.4s \n" \ + "fmax v18.4s, v18.4s, v22.4s \n" \ + "fmax v2.4s, v2.4s, v18.4s \n" \ + "fmaxv "TMPF0", v2.4s \n" \ + "fcmp "MAXF", "TMPF0" \n" \ + "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ + "csel "INDEX", "INDEX", "Z", ge \n" \ + "add "Z", "Z", #"N_KERNEL_SIZE" \n" + +#else + +#define KERNEL_F \ + "ldp q2, q3, ["X"] \n" \ + "ldp q4, q5, ["X", #32] \n" \ + "ldp q6, q7, ["X", #64] \n" \ + "ldp q16, q17, ["X", #96] \n" \ + "ldp q18, q19, ["X", #128] \n" \ + "ldp q20, q21, ["X", #160] \n" \ + "ldp q22, q23, ["X", #192] \n" \ + "ldp q24, q25, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fabs v2.2d, v2.2d \n" \ + "fabs v3.2d, v3.2d \n" \ + "fabs v4.2d, v4.2d \n" \ + "fabs v5.2d, v5.2d \n" \ + "fabs v6.2d, v6.2d \n" \ + "fabs v7.2d, v7.2d \n" \ + "fabs v16.2d, v16.2d \n" \ + "fabs v17.2d, v17.2d \n" \ + "fabs v18.2d, v18.2d \n" \ + "fabs v19.2d, v19.2d \n" \ + "fabs v20.2d, v20.2d \n" \ + "fabs v21.2d, v21.2d \n" \ + "fabs v22.2d, v22.2d \n" \ + "fabs v23.2d, v23.2d \n" \ + "fabs v24.2d, v24.2d \n" \ + "fabs v25.2d, v25.2d \n" \ + "fmax v2.2d, v2.2d, v3.2d \n" \ + "fmax v4.2d, v4.2d, v5.2d \n" \ + "fmax v6.2d, v6.2d, v7.2d \n" \ + "fmax v16.2d, v16.2d, v17.2d \n" \ + "fmax v18.2d, v18.2d, v19.2d \n" \ + "fmax v20.2d, v20.2d, v21.2d \n" \ + "fmax v22.2d, v22.2d, v23.2d \n" \ + "fmax v24.2d, v24.2d, v25.2d \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fmax v2.2d, v2.2d, v4.2d \n" \ + "fmax v6.2d, v6.2d, v16.2d \n" \ + "fmax v18.2d, v18.2d, v20.2d \n" \ + "fmax v22.2d, v22.2d, v24.2d \n" \ + "fmax v2.2d, v2.2d, v6.2d \n" \ + "fmax v18.2d, v18.2d, v22.2d \n" \ + "fmax v2.2d, v2.2d, v18.2d \n" \ + "ins v3.d[0], v2.d[1] \n" \ + "fmax "TMPF0", d3, d2 \n" \ + "fcmp "MAXF", "TMPF0" \n" \ + "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ + "csel "INDEX", "INDEX", "Z", ge \n" \ + "add "Z", "Z", #"N_KERNEL_SIZE" \n" +#endif + +#define KERNEL_F_FINALIZE \ + "sub x6, "INDEX", #1 \n" \ + "lsl x6, x6, #"INC_SHIFT" \n" \ + "add x7, x7, x6 \n" \ + "mov x6, #0 \n" \ + "1: \n" \ + "add x6, x6, #1 \n" \ + "cmp x6, #"N_KERNEL_SIZE" \n" \ + "bge 2f \n" \ + "ldr "TMPF1", [x7] \n" \ + "fabs "TMPF1", "TMPF1" \n" \ + "fcmp "MAXF", "TMPF1" \n" \ + "add x7, x7, #"SZ" \n" \ + "bne 1b \n" \ + "2: \n" \ + "sub x6, x6, #1 \n" \ + "add "INDEX", "INDEX", x6 \n" + + +#define INIT \ + "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ + "ldr "MAXF", ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "mov "Z", #1 \n" \ + "mov "INDEX", "Z" \n" \ + "fabs "MAXF", "MAXF" \n" + + +#define KERNEL_S1 \ + "ldr "TMPF0", ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "add "Z", "Z", #1 \n" \ + "fabs "TMPF0", "TMPF0" \n" \ + "fcmp "MAXF", "TMPF0" \n" \ + "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ + "csel "INDEX", "INDEX", "Z", ge \n" + + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + +static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG index = 0; + + if ( n < 0 ) return index; + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + + " cmp "N", xzr \n" + " ble .Liamax_kernel_zero \n" + " cmp "INC_X", xzr \n" + " ble .Liamax_kernel_zero \n" + " cmp "INC_X", #1 \n" + " bne .Liamax_kernel_S_BEGIN \n" + " mov x7, "X" \n" + + ".Liamax_kernel_F_BEGIN: \n" + " "INIT" \n" + " subs "N", "N", #1 \n" + " ble .Liamax_kernel_L999 \n" + " asr "J", "N", #"N_DIV_SHIFT" \n" + " cmp "J", xzr \n" + " beq .Liamax_kernel_F1 \n" + " add "Z", "Z", #1 \n" + + ".Liamax_kernel_F: \n" + " "KERNEL_F" \n" + " subs "J", "J", #1 \n" + " bne .Liamax_kernel_F \n" + " "KERNEL_F_FINALIZE" \n" + " sub "Z", "Z", #1 \n" + + ".Liamax_kernel_F1: \n" + " ands "J", "N", #"N_REM_MASK" \n" + " ble .Liamax_kernel_L999 \n" + + ".Liamax_kernel_F10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne .Liamax_kernel_F10 \n" + " b .Liamax_kernel_L999 \n" + + ".Liamax_kernel_S_BEGIN: \n" + " "INIT" \n" + " subs "N", "N", #1 \n" + " ble .Liamax_kernel_L999 \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble .Liamax_kernel_S1 \n" + + ".Liamax_kernel_S4: \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne .Liamax_kernel_S4 \n" + + ".Liamax_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble .Liamax_kernel_L999 \n" + + ".Liamax_kernel_S10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne .Liamax_kernel_S10 \n" + + ".Liamax_kernel_L999: \n" + " mov x0, "INDEX" \n" + " b .Liamax_kernel_DONE \n" + + ".Liamax_kernel_zero: \n" + " mov x0, xzr \n" + + ".Liamax_kernel_DONE: \n" + " mov %[INDEX_], "INDEX" \n" + + : [INDEX_] "=r" (index) //%0 + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x) //%3 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + + return index; +} + +#if defined(SMP) +static int iamax_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *(BLASLONG *)result = iamax_compute(n, x, inc_x); + + return 0; +} +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + BLASLONG max_index = 0; + +#if defined(SMP) + nthreads = num_cpu_avail(1); + + if (inc_x == 0) + nthreads = 1; + + if (n <= 1000) + nthreads = 1; + + if (nthreads == 1) { + max_index = iamax_compute(n, x, inc_x); + } else { + BLASLONG i, width, cur_index; + int num_cpu; + int mode; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + FLOAT max = -1.0; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE; +#else + mode = BLAS_DOUBLE; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, NULL, 0, result, 0, + ( void *)iamax_thread_function, nthreads); + + num_cpu = 0; + i = n; + cur_index = 0; + + while (i > 0) { + FLOAT elem; + BLASLONG cur_max_index; + + cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2]; + elem = x[((cur_index + cur_max_index - 1) * inc_x)]; + elem = fabs(elem); + + if (elem >= max) { + max = elem; + max_index = cur_index + cur_max_index; + } + + width = blas_quickdivide(i + nthreads - num_cpu - 1, + nthreads - num_cpu); + i -= width; + cur_index += width; + num_cpu ++; + } + } +#else + max_index = iamax_compute(n, x, inc_x); +#endif + + return max_index; +}