Merge pull request #1108 from ashwinyes/develop_20170203_thunderx2t99

Optimized Implementations for ThunderX2T99
This commit is contained in:
Martin Kroeker 2017-02-28 16:02:19 +01:00 committed by GitHub
commit ffc1d6c468
16 changed files with 4213 additions and 844 deletions

View File

@ -751,6 +751,10 @@ void blas_set_parameter(void)
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;

View File

@ -42,9 +42,13 @@
#include "functable.h"
#endif
#if defined(THUNDERX2T99) || defined(VULCAN)
// Multithreaded swap gives performance benefits in ThunderX2T99
#else
// Disable multi-threading as it does not show any performance
// benefits. Keep the multi-threading code for the record.
#undef SMP
#endif
#ifndef CBLAS
@ -81,7 +85,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))

View File

@ -10,13 +10,29 @@ DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SNRM2KERNEL = snrm2_thunderx2t99.c
CNRM2KERNEL = cnrm2_thunderx2t99.S
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DAXPYKERNEL = daxpy_thunderx2t99.S
DDOTKERNEL = ddot_thunderx2t99.c
DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
@ -29,3 +45,7 @@ endif
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
endif
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
endif

View File

@ -1,228 +0,0 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define TMPF d16
#define SSQ s0
#define SSQD d0
/******************************************************************************/
.macro INIT
fmov SSQD, xzr
fmov d1, xzr
fmov d2, xzr
fmov d3, xzr
fmov d4, xzr
fmov d5, xzr
fmov d6, xzr
fmov d7, xzr
.endm
.macro KERNEL_F1
ldr TMPF, [X]
add X, X, #8
fcvtl v16.2d, v16.2s
fmla v0.2d, v16.2d, v16.2d
.endm
.macro KERNEL_F16
ldur q16, [X]
ldur q18, [X, #16]
ldur q20, [X, #32]
ldur q22, [X, #48]
ldur q24, [X, #64]
ldur q26, [X, #80]
ldur q28, [X, #96]
ldur q30, [X, #112]
add X, X, #128
fcvtl2 v17.2d, v16.4s
fcvtl v16.2d, v16.2s
fcvtl2 v19.2d, v18.4s
fcvtl v18.2d, v18.2s
fcvtl2 v21.2d, v20.4s
fcvtl v20.2d, v20.2s
fcvtl2 v23.2d, v22.4s
fcvtl v22.2d, v22.2s
fcvtl2 v25.2d, v24.4s
fcvtl v24.2d, v24.2s
fcvtl2 v27.2d, v26.4s
fcvtl v26.2d, v26.2s
fcvtl2 v29.2d, v28.4s
fcvtl v28.2d, v28.2s
fcvtl2 v31.2d, v30.4s
fcvtl v30.2d, v30.2s
fmla v0.2d, v16.2d, v16.2d
fmla v1.2d, v17.2d, v17.2d
fmla v2.2d, v18.2d, v18.2d
fmla v3.2d, v19.2d, v19.2d
fmla v4.2d, v20.2d, v20.2d
fmla v5.2d, v21.2d, v21.2d
fmla v6.2d, v22.2d, v22.2d
fmla v7.2d, v23.2d, v23.2d
fmla v0.2d, v24.2d, v24.2d
fmla v1.2d, v25.2d, v25.2d
fmla v2.2d, v26.2d, v26.2d
fmla v3.2d, v27.2d, v27.2d
fmla v4.2d, v28.2d, v28.2d
fmla v5.2d, v29.2d, v29.2d
fmla v6.2d, v30.2d, v30.2d
fmla v7.2d, v31.2d, v31.2d
prfm PLDL1KEEP, [X, #1024]
prfm PLDL1KEEP, [X, #1024+64]
.endm
.macro KERNEL_F16_FINALIZE
fadd v0.2d, v0.2d, v1.2d
fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v6.2d, v6.2d, v7.2d
fadd v0.2d, v0.2d, v2.2d
fadd v4.2d, v4.2d, v6.2d
fadd v0.2d, v0.2d, v4.2d
.endm
.macro KERNEL_FINALIZE
faddp SSQD, v0.2d
.endm
.macro INIT_S
lsl INC_X, INC_X, #3
.endm
.macro KERNEL_S1
ldr TMPF, [X]
add X, X, INC_X
fcvtl v16.2d, v16.2s
fmla v0.2d, v16.2d, v16.2d
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
INIT
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
asr I, N, #4
cmp I, xzr
beq nrm2_kernel_S_BEGIN
.align 5
nrm2_kernel_F16:
KERNEL_F16
subs I, I, #1
bne nrm2_kernel_F16
KERNEL_F16_FINALIZE
nrm2_kernel_F1:
ands I, N, #15
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
nrm2_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S4
nrm2_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
KERNEL_FINALIZE
fsqrt SSQD, SSQD
fcvt SSQ, SSQD
ret
nrm2_kernel_zero:
fmov SSQ, wzr
ret
EPILOGUE

View File

@ -1,269 +0,0 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define Y "x3" /* "Y" vector address */
#define INC_Y "x4" /* "Y" stride */
#define J "x5" /* loop variable */
#define REG0 "xzr"
#define DOTF "d0"
#define TMPX "d16"
#define LD1VX "{v16.d}[0]"
#define TMPY "d24"
#define LD1VY "{v24.d}[0]"
#define SZ "8"
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static FLOAT ddot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
FLOAT dot = 0.0 ;
if ( n < 0 ) return(dot);
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n"
" fmov "DOTF", "REG0" \n"
" fmov d1, "REG0" \n"
" fmov d2, "REG0" \n"
" fmov d3, "REG0" \n"
" fmov d4, "REG0" \n"
" fmov d5, "REG0" \n"
" fmov d6, "REG0" \n"
" fmov d7, "REG0" \n"
" cmp "N", xzr \n"
" ble .Ldot_kernel_L999 \n"
" cmp "INC_X", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
".Ldot_kernel_F_BEGIN: \n"
" asr "J", "N", #5 \n"
" cmp "J", xzr \n"
" beq .Ldot_kernel_F1 \n"
" .align 5 \n"
".Ldot_kernel_F32: \n"
" ldp q16, q17, ["X"] \n"
" ldp q24, q25, ["Y"] \n"
" ldp q18, q19, ["X", #32] \n"
" ldp q26, q27, ["Y", #32] \n"
" fmla v0.2d, v16.2d, v24.2d \n"
" fmla v1.2d, v17.2d, v25.2d \n"
" ldp q20, q21, ["X", #64] \n"
" ldp q28, q29, ["Y", #64] \n"
" fmla v2.2d, v18.2d, v26.2d \n"
" fmla v3.2d, v19.2d, v27.2d \n"
" ldp q22, q23, ["X", #96] \n"
" ldp q30, q31, ["Y", #96] \n"
" add "Y", "Y", #128 \n"
" add "X", "X", #128 \n"
" fmla v4.2d, v20.2d, v28.2d \n"
" fmla v5.2d, v21.2d, v29.2d \n"
" PRFM PLDL1KEEP, ["X", #896] \n"
" PRFM PLDL1KEEP, ["Y", #896] \n"
" PRFM PLDL1KEEP, ["X", #896+64] \n"
" PRFM PLDL1KEEP, ["Y", #896+64] \n"
" fmla v6.2d, v22.2d, v30.2d \n"
" fmla v7.2d, v23.2d, v31.2d \n"
" ldp q16, q17, ["X"] \n"
" ldp q24, q25, ["Y"] \n"
" ldp q18, q19, ["X", #32] \n"
" ldp q26, q27, ["Y", #32] \n"
" fmla v0.2d, v16.2d, v24.2d \n"
" fmla v1.2d, v17.2d, v25.2d \n"
" ldp q20, q21, ["X", #64] \n"
" ldp q28, q29, ["Y", #64] \n"
" fmla v2.2d, v18.2d, v26.2d \n"
" fmla v3.2d, v19.2d, v27.2d \n"
" ldp q22, q23, ["X", #96] \n"
" ldp q30, q31, ["Y", #96] \n"
" add "Y", "Y", #128 \n"
" add "X", "X", #128 \n"
" fmla v4.2d, v20.2d, v28.2d \n"
" fmla v5.2d, v21.2d, v29.2d \n"
" PRFM PLDL1KEEP, ["X", #896] \n"
" PRFM PLDL1KEEP, ["Y", #896] \n"
" PRFM PLDL1KEEP, ["X", #896+64] \n"
" PRFM PLDL1KEEP, ["Y", #896+64] \n"
" fmla v6.2d, v22.2d, v30.2d \n"
" fmla v7.2d, v23.2d, v31.2d \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F32 \n"
" fadd v0.2d, v0.2d, v1.2d \n"
" fadd v2.2d, v2.2d, v3.2d \n"
" fadd v4.2d, v4.2d, v5.2d \n"
" fadd v6.2d, v6.2d, v7.2d \n"
" fadd v0.2d, v0.2d, v2.2d \n"
" fadd v4.2d, v4.2d, v6.2d \n"
" fadd v0.2d, v0.2d, v4.2d \n"
" faddp "DOTF", v0.2d \n"
".Ldot_kernel_F1: \n"
" ands "J", "N", #31 \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_F10: \n"
" ldr "TMPX", ["X"] \n"
" ldr "TMPY", ["Y"] \n"
" add "X", "X", #"SZ" \n"
" add "Y", "Y", #"SZ" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F10 \n"
" b .Ldot_kernel_L999 \n"
".Ldot_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", #3 \n"
" lsl "INC_Y", "INC_Y", #3 \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Ldot_kernel_S1 \n"
".Ldot_kernel_S4: \n"
" ld1 "LD1VX", ["X"], "INC_X" \n"
" ld1 "LD1VY", ["Y"], "INC_Y" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" ld1 "LD1VX", ["X"], "INC_X" \n"
" ld1 "LD1VY", ["Y"], "INC_Y" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" ld1 "LD1VX", ["X"], "INC_X" \n"
" ld1 "LD1VY", ["Y"], "INC_Y" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" ld1 "LD1VX", ["X"], "INC_X" \n"
" ld1 "LD1VY", ["Y"], "INC_Y" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S4 \n"
".Ldot_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_S10: \n"
" ld1 "LD1VX", ["X"], "INC_X" \n"
" ld1 "LD1VY", ["Y"], "INC_Y" \n"
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S10 \n"
".Ldot_kernel_L999: \n"
" fmov %[DOT_], "DOTF" \n"
: [DOT_] "=r" (dot) //%0
: [N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x), //%3
[Y_] "r" (y), //%4
[INCY_] "r" (inc_y) //%5
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
return(dot);
}
#if defined(SMP)
static int ddot_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
*result = ddot_compute(n, x, inc_x, y, inc_y);
return 0;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
#endif
FLOAT dot = 0.0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
dot = ddot_compute(n, x, inc_x, y, inc_y);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
FLOAT *ptr;
mode = BLAS_DOUBLE | BLAS_REAL;
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, result, 0,
( void *)ddot_thread_function, nthreads);
ptr = (FLOAT *)result;
for (i = 0; i < nthreads; i++) {
dot = dot + (*ptr);
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
dot = ddot_compute(n, x, inc_x, y, inc_y);
#endif
return dot;
}

View File

@ -1,169 +0,0 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8
/******************************************************************************/
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
.macro KERNEL_F8
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro nrm2_kernel_F8_FINALIZE
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
.endm
.macro INIT_S
lsl INC_X, INC_X, #3
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov SSQ, xzr
fmov d5, SSQ
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq nrm2_kernel_F1_INIT
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F8_FINALIZE
nrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_F1_INIT:
b nrm2_kernel_F1
nrm2_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble nrm2_kernel_L999
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
nrm2_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S4
nrm2_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
fsqrt SSQ, SSQ
ret
nrm2_kernel_zero:
ret
EPILOGUE

View File

@ -0,0 +1,423 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#if !defined(DSDOT)
#define RETURN_TYPE FLOAT
#else
#define RETURN_TYPE double
#endif
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define Y "x3" /* "Y" vector address */
#define INC_Y "x4" /* "Y" stride */
#define J "x5" /* loop variable */
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 "wzr"
#define DOTF "s0"
#define TMPX "s16"
#define TMPY "s24"
#define INC_SHIFT "2"
#define N_DIV_SHIFT "6"
#define N_REM_MASK "63"
#else
#define REG0 "xzr"
#define DOTF "d0"
#define TMPX "s16"
#define TMPX1 "d2"
#define TMPY "s24"
#define TMPY1 "d3"
#define INC_SHIFT "2"
#define N_DIV_SHIFT "4"
#define N_REM_MASK "15"
#endif
#else
#define REG0 "xzr"
#define DOTF "d0"
#define TMPX "d16"
#define TMPY "d24"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "5"
#define N_REM_MASK "31"
#endif
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define KERNEL_F1 \
" ldr "TMPX", ["X"] \n" \
" ldr "TMPY", ["Y"] \n" \
" add "X", "X", "INC_X" \n" \
" add "Y", "Y", "INC_Y" \n" \
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
#define KERNEL_F \
" ldp q16, q17, ["X"] \n" \
" ldp q24, q25, ["Y"] \n" \
" ldp q18, q19, ["X", #32] \n" \
" ldp q26, q27, ["Y", #32] \n" \
" fmla v0.4s, v16.4s, v24.4s \n" \
" fmla v1.4s, v17.4s, v25.4s \n" \
" ldp q20, q21, ["X", #64] \n" \
" ldp q28, q29, ["Y", #64] \n" \
" fmla v2.4s, v18.4s, v26.4s \n" \
" fmla v3.4s, v19.4s, v27.4s \n" \
" ldp q22, q23, ["X", #96] \n" \
" ldp q30, q31, ["Y", #96] \n" \
" add "Y", "Y", #128 \n" \
" add "X", "X", #128 \n" \
" fmla v4.4s, v20.4s, v28.4s \n" \
" fmla v5.4s, v21.4s, v29.4s \n" \
" PRFM PLDL1KEEP, ["X", #896] \n" \
" PRFM PLDL1KEEP, ["Y", #896] \n" \
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
" fmla v6.4s, v22.4s, v30.4s \n" \
" fmla v7.4s, v23.4s, v31.4s \n" \
" ldp q16, q17, ["X"] \n" \
" ldp q24, q25, ["Y"] \n" \
" ldp q18, q19, ["X", #32] \n" \
" ldp q26, q27, ["Y", #32] \n" \
" fmla v0.4s, v16.4s, v24.4s \n" \
" fmla v1.4s, v17.4s, v25.4s \n" \
" ldp q20, q21, ["X", #64] \n" \
" ldp q28, q29, ["Y", #64] \n" \
" fmla v2.4s, v18.4s, v26.4s \n" \
" fmla v3.4s, v19.4s, v27.4s \n" \
" ldp q22, q23, ["X", #96] \n" \
" ldp q30, q31, ["Y", #96] \n" \
" add "Y", "Y", #128 \n" \
" add "X", "X", #128 \n" \
" fmla v4.4s, v20.4s, v28.4s \n" \
" fmla v5.4s, v21.4s, v29.4s \n" \
" PRFM PLDL1KEEP, ["X", #896] \n" \
" PRFM PLDL1KEEP, ["Y", #896] \n" \
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
" fmla v6.4s, v22.4s, v30.4s \n" \
" fmla v7.4s, v23.4s, v31.4s \n"
#define KERNEL_F_FINALIZE \
" fadd v0.4s, v0.4s, v1.4s \n" \
" fadd v2.4s, v2.4s, v3.4s \n" \
" fadd v4.4s, v4.4s, v5.4s \n" \
" fadd v6.4s, v6.4s, v7.4s \n" \
" fadd v0.4s, v0.4s, v2.4s \n" \
" fadd v4.4s, v4.4s, v6.4s \n" \
" fadd v0.4s, v0.4s, v4.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n"
#else /* !defined(DSDOT) */
#define KERNEL_F1 \
" ldr "TMPX", ["X"] \n" \
" ldr "TMPY", ["Y"] \n" \
" add "X", "X", "INC_X" \n" \
" add "Y", "Y", "INC_Y" \n" \
" fcvt "TMPX1", "TMPX" \n" \
" fcvt "TMPY1", "TMPY" \n" \
" fmul "TMPX1", "TMPX1", "TMPY1" \n" \
" fadd "DOTF", "DOTF", "TMPX1" \n"
#define KERNEL_F \
" ldp q18, q19, ["X"] \n" \
" ldp q26, q27, ["Y"] \n" \
" fcvtl v16.2d, v18.2s \n" \
" fcvtl2 v17.2d, v18.4s \n" \
" fcvtl v18.2d, v19.2s \n" \
" fcvtl2 v19.2d, v19.4s \n" \
" fcvtl v24.2d, v26.2s \n" \
" fcvtl2 v25.2d, v26.4s \n" \
" fcvtl v26.2d, v27.2s \n" \
" fcvtl2 v27.2d, v27.4s \n" \
" ldp q22, q23, ["X", #32] \n" \
" ldp q30, q31, ["Y", #32] \n" \
" fcvtl v20.2d, v22.2s \n" \
" fcvtl2 v21.2d, v22.4s \n" \
" fcvtl v22.2d, v23.2s \n" \
" fcvtl2 v23.2d, v23.4s \n" \
" fcvtl v28.2d, v30.2s \n" \
" fcvtl2 v29.2d, v30.4s \n" \
" fcvtl v30.2d, v31.2s \n" \
" fcvtl2 v31.2d, v31.4s \n" \
" PRFM PLDL1KEEP, ["X", #896] \n" \
" PRFM PLDL1KEEP, ["Y", #896] \n" \
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
" fmla v0.2d, v16.2d, v24.2d \n" \
" fmla v1.2d, v17.2d, v25.2d \n" \
" fmla v2.2d, v18.2d, v26.2d \n" \
" fmla v3.2d, v19.2d, v27.2d \n" \
" add "Y", "Y", #64 \n" \
" add "X", "X", #64 \n" \
" fmla v4.2d, v20.2d, v28.2d \n" \
" fmla v5.2d, v21.2d, v29.2d \n" \
" fmla v6.2d, v22.2d, v30.2d \n" \
" fmla v7.2d, v23.2d, v31.2d \n"
#define KERNEL_F_FINALIZE \
" fadd v0.2d, v0.2d, v1.2d \n" \
" fadd v2.2d, v2.2d, v3.2d \n" \
" fadd v4.2d, v4.2d, v5.2d \n" \
" fadd v6.2d, v6.2d, v7.2d \n" \
" fadd v0.2d, v0.2d, v2.2d \n" \
" fadd v4.2d, v4.2d, v6.2d \n" \
" fadd v0.2d, v0.2d, v4.2d \n" \
" faddp "DOTF", v0.2d \n"
#endif /* !defined(DSDOT) */
#else /* !defined(DOUBLE) */
#define KERNEL_F1 \
" ldr "TMPX", ["X"] \n" \
" ldr "TMPY", ["Y"] \n" \
" add "X", "X", "INC_X" \n" \
" add "Y", "Y", "INC_Y" \n" \
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
#define KERNEL_F \
" ldp q16, q17, ["X"] \n" \
" ldp q24, q25, ["Y"] \n" \
" ldp q18, q19, ["X", #32] \n" \
" ldp q26, q27, ["Y", #32] \n" \
" fmla v0.2d, v16.2d, v24.2d \n" \
" fmla v1.2d, v17.2d, v25.2d \n" \
" ldp q20, q21, ["X", #64] \n" \
" ldp q28, q29, ["Y", #64] \n" \
" fmla v2.2d, v18.2d, v26.2d \n" \
" fmla v3.2d, v19.2d, v27.2d \n" \
" ldp q22, q23, ["X", #96] \n" \
" ldp q30, q31, ["Y", #96] \n" \
" add "Y", "Y", #128 \n" \
" add "X", "X", #128 \n" \
" fmla v4.2d, v20.2d, v28.2d \n" \
" fmla v5.2d, v21.2d, v29.2d \n" \
" PRFM PLDL1KEEP, ["X", #896] \n" \
" PRFM PLDL1KEEP, ["Y", #896] \n" \
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
" fmla v6.2d, v22.2d, v30.2d \n" \
" fmla v7.2d, v23.2d, v31.2d \n" \
" ldp q16, q17, ["X"] \n" \
" ldp q24, q25, ["Y"] \n" \
" ldp q18, q19, ["X", #32] \n" \
" ldp q26, q27, ["Y", #32] \n" \
" fmla v0.2d, v16.2d, v24.2d \n" \
" fmla v1.2d, v17.2d, v25.2d \n" \
" ldp q20, q21, ["X", #64] \n" \
" ldp q28, q29, ["Y", #64] \n" \
" fmla v2.2d, v18.2d, v26.2d \n" \
" fmla v3.2d, v19.2d, v27.2d \n" \
" ldp q22, q23, ["X", #96] \n" \
" ldp q30, q31, ["Y", #96] \n" \
" add "Y", "Y", #128 \n" \
" add "X", "X", #128 \n" \
" fmla v4.2d, v20.2d, v28.2d \n" \
" fmla v5.2d, v21.2d, v29.2d \n" \
" PRFM PLDL1KEEP, ["X", #896] \n" \
" PRFM PLDL1KEEP, ["Y", #896] \n" \
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
" fmla v6.2d, v22.2d, v30.2d \n" \
" fmla v7.2d, v23.2d, v31.2d \n"
#define KERNEL_F_FINALIZE \
" fadd v0.2d, v0.2d, v1.2d \n" \
" fadd v2.2d, v2.2d, v3.2d \n" \
" fadd v4.2d, v4.2d, v5.2d \n" \
" fadd v6.2d, v6.2d, v7.2d \n" \
" fadd v0.2d, v0.2d, v2.2d \n" \
" fadd v4.2d, v4.2d, v6.2d \n" \
" fadd v0.2d, v0.2d, v4.2d \n" \
" faddp "DOTF", v0.2d \n"
#endif /* !defined(DOUBLE) */
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
RETURN_TYPE dot = 0.0 ;
if ( n < 0 ) return dot;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n"
" fmov "DOTF", "REG0" \n"
" fmov d1, xzr \n"
" fmov d2, xzr \n"
" fmov d3, xzr \n"
" fmov d4, xzr \n"
" fmov d5, xzr \n"
" fmov d6, xzr \n"
" fmov d7, xzr \n"
" cmp "N", xzr \n"
" ble .Ldot_kernel_L999 \n"
" cmp "INC_X", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
".Ldot_kernel_F_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Ldot_kernel_F1 \n"
" .align 5 \n"
".Ldot_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
".Ldot_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F10 \n"
" b .Ldot_kernel_L999 \n"
".Ldot_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Ldot_kernel_S1 \n"
".Ldot_kernel_S4: \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S4 \n"
".Ldot_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_S10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S10 \n"
".Ldot_kernel_L999: \n"
" str "DOTF", [%[DOT_]] \n"
:
: [DOT_] "r" (&dot), //%0
[N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x), //%3
[Y_] "r" (y), //%4
[INCY_] "r" (inc_y) //%5
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
return dot;
}
#if defined(SMP)
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
return 0;
}
#endif
RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
#endif
RETURN_TYPE dot = 0.0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
RETURN_TYPE *ptr;
#if !defined(DOUBLE)
mode = BLAS_SINGLE | BLAS_REAL;
#else
mode = BLAS_DOUBLE | BLAS_REAL;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, result, 0,
( void *)dot_thread_function, nthreads);
ptr = (RETURN_TYPE *)result;
for (i = 0; i < nthreads; i++) {
dot = dot + (*ptr);
ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
dot = dot_compute(n, x, inc_x, y, inc_y);
#endif
return dot;
}

View File

@ -0,0 +1,384 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define J "x3" /* loop variable */
#define K "x4" /* loop variable */
#if !defined(COMPLEX)
#define INC_SHIFT "3"
#define SZ "8"
#else
#define INC_SHIFT "4"
#define SZ "16"
#endif
#define SSQ "d0"
#define SCALE "d1"
#define REGZERO "d5"
#define REGONE "d6"
#define CUR_MAX "d7"
#define CUR_MAXINV "d8"
#define CUR_MAXINV_V "v8.2d"
#define CUR_MAX_V "v8.2d"
static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
double *ssq, double *scale)
{
*ssq = 0.0;
*scale = 0.0;
if (n <= 0) return;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" fmov "SCALE", xzr \n"
" fmov "SSQ", #1.0 \n"
" cmp "N", xzr \n"
" ble .Lnrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n"
" ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F_BEGIN: \n"
" fmov "REGZERO", xzr \n"
" fmov "REGONE", #1.0 \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" mov "J", "N" \n"
" cmp "J", xzr \n"
" beq .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F_ZERO_SKIP: \n"
" ldr d4, ["X"] \n"
" fcmp d4, "REGZERO" \n"
" bne .Lnrm2_kernel_F_INIT \n"
#if defined(COMPLEX)
" ldr d4, ["X", #8] \n"
" fcmp d4, "REGZERO" \n"
" bne .Lnrm2_kernel_F_INIT_I \n"
#endif
" add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n"
" beq .Lnrm2_kernel_L999 \n"
" b .Lnrm2_kernel_F_ZERO_SKIP \n"
".Lnrm2_kernel_F_INIT: \n"
" ldr d4, ["X"] \n"
" fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" fdiv d4, d4, "CUR_MAX" \n"
" fmul d4, d4, d4 \n"
" fadd "SSQ", "SSQ", d4 \n"
" fmov "SCALE", "CUR_MAX" \n"
#if defined(COMPLEX)
".Lnrm2_kernel_F_INIT_I: \n"
" ldr d3, ["X", #8] \n"
" fabs d3, d3 \n"
" fmax "CUR_MAX", "SCALE", d3 \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" fdiv d3, d3, "CUR_MAX" \n"
" fmul d3, d3, d3 \n"
" fadd "SSQ", "SSQ", d3 \n"
" fmov "SCALE", "CUR_MAX" \n"
#endif
" add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n"
" beq .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F_START: \n"
" cmp "INC_X", #"SZ" \n"
" bne .Lnrm2_kernel_F1 \n"
" asr "K", "J", #4 \n"
" cmp "K", xzr \n"
" beq .Lnrm2_kernel_F1 \n"
".Lnrm2_kernel_F: \n"
" ldp q16, q17, ["X"] \n"
" ldp q18, q19, ["X", #32] \n"
" ldp q20, q21, ["X", #64] \n"
" ldp q22, q23, ["X", #96] \n"
" add "X", "X", #128 \n"
" fabs v16.2d, v16.2d \n"
" fabs v17.2d, v17.2d \n"
" fabs v18.2d, v18.2d \n"
" fabs v19.2d, v19.2d \n"
" fabs v20.2d, v20.2d \n"
" fabs v21.2d, v21.2d \n"
" fabs v22.2d, v22.2d \n"
" fabs v23.2d, v23.2d \n"
" fmaxp v24.2d, v16.2d, v17.2d \n"
" fmaxp v25.2d, v18.2d, v19.2d \n"
" fmaxp v26.2d, v20.2d, v21.2d \n"
" fmaxp v27.2d, v22.2d, v23.2d \n"
" fmaxp v24.2d, v24.2d, v25.2d \n"
" fmaxp v26.2d, v26.2d, v27.2d \n"
" fmaxp v24.2d, v24.2d, v26.2d \n"
" fmaxp v24.2d, v24.2d, v24.2d \n"
" fmax "CUR_MAX", "SCALE", d24 \n"
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
" //dup "CUR_MAX_V", v7.d[0] \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" dup "CUR_MAXINV_V", v8.d[0] \n"
" fmul v16.2d, v16.2d, "CUR_MAXINV_V" \n"
" fmul v17.2d, v17.2d, "CUR_MAXINV_V" \n"
" fmul v18.2d, v18.2d, "CUR_MAXINV_V" \n"
" fmul v19.2d, v19.2d, "CUR_MAXINV_V" \n"
" fmul v20.2d, v20.2d, "CUR_MAXINV_V" \n"
" fmul v21.2d, v21.2d, "CUR_MAXINV_V" \n"
" fmul v22.2d, v22.2d, "CUR_MAXINV_V" \n"
" fmul v23.2d, v23.2d, "CUR_MAXINV_V" \n"
" //fdiv v16.2d, v16.2d, "CUR_MAX_V" \n"
" //fdiv v17.2d, v17.2d, "CUR_MAX_V" \n"
" //fdiv v18.2d, v18.2d, "CUR_MAX_V" \n"
" //fdiv v19.2d, v19.2d, "CUR_MAX_V" \n"
" //fdiv v20.2d, v20.2d, "CUR_MAX_V" \n"
" //fdiv v21.2d, v21.2d, "CUR_MAX_V" \n"
" //fdiv v22.2d, v22.2d, "CUR_MAX_V" \n"
" //fdiv v23.2d, v23.2d, "CUR_MAX_V" \n"
" fmul v24.2d, v16.2d, v16.2d \n"
" fmul v25.2d, v17.2d, v17.2d \n"
" fmul v26.2d, v18.2d, v18.2d \n"
" fmul v27.2d, v19.2d, v19.2d \n"
" fmla v24.2d, v20.2d, v20.2d \n"
" fmla v25.2d, v21.2d, v21.2d \n"
" fmla v26.2d, v22.2d, v22.2d \n"
" fmla v27.2d, v23.2d, v23.2d \n"
" fadd v24.2d, v24.2d, v25.2d \n"
" fadd v26.2d, v26.2d, v27.2d \n"
" fadd v24.2d, v24.2d, v26.2d \n"
" faddp d24, v24.2d \n"
" fadd "SSQ", "SSQ", d24 \n"
" fmov "SCALE", "CUR_MAX" \n"
#if defined(COMPLEX)
" ldp q16, q17, ["X"] \n"
" ldp q18, q19, ["X", #32] \n"
" ldp q20, q21, ["X", #64] \n"
" ldp q22, q23, ["X", #96] \n"
" add "X", "X", #128 \n"
" fabs v16.2d, v16.2d \n"
" fabs v17.2d, v17.2d \n"
" fabs v18.2d, v18.2d \n"
" fabs v19.2d, v19.2d \n"
" fabs v20.2d, v20.2d \n"
" fabs v21.2d, v21.2d \n"
" fabs v22.2d, v22.2d \n"
" fabs v23.2d, v23.2d \n"
" fmaxp v24.2d, v16.2d, v17.2d \n"
" fmaxp v25.2d, v18.2d, v19.2d \n"
" fmaxp v26.2d, v20.2d, v21.2d \n"
" fmaxp v27.2d, v22.2d, v23.2d \n"
" fmaxp v24.2d, v24.2d, v25.2d \n"
" fmaxp v26.2d, v26.2d, v27.2d \n"
" fmaxp v24.2d, v24.2d, v26.2d \n"
" fmaxp v24.2d, v24.2d, v24.2d \n"
" fmax "CUR_MAX", "SCALE", d24 \n"
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
" //dup "CUR_MAX_V", v7.d[0] \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" dup "CUR_MAXINV_V", v8.d[0] \n"
" fmul v16.2d, v16.2d, "CUR_MAXINV_V" \n"
" fmul v17.2d, v17.2d, "CUR_MAXINV_V" \n"
" fmul v18.2d, v18.2d, "CUR_MAXINV_V" \n"
" fmul v19.2d, v19.2d, "CUR_MAXINV_V" \n"
" fmul v20.2d, v20.2d, "CUR_MAXINV_V" \n"
" fmul v21.2d, v21.2d, "CUR_MAXINV_V" \n"
" fmul v22.2d, v22.2d, "CUR_MAXINV_V" \n"
" fmul v23.2d, v23.2d, "CUR_MAXINV_V" \n"
" //fdiv v16.2d, v16.2d, "CUR_MAX_V" \n"
" //fdiv v17.2d, v17.2d, "CUR_MAX_V" \n"
" //fdiv v18.2d, v18.2d, "CUR_MAX_V" \n"
" //fdiv v19.2d, v19.2d, "CUR_MAX_V" \n"
" //fdiv v20.2d, v20.2d, "CUR_MAX_V" \n"
" //fdiv v21.2d, v21.2d, "CUR_MAX_V" \n"
" //fdiv v22.2d, v22.2d, "CUR_MAX_V" \n"
" //fdiv v23.2d, v23.2d, "CUR_MAX_V" \n"
" fmul v24.2d, v16.2d, v16.2d \n"
" fmul v25.2d, v17.2d, v17.2d \n"
" fmul v26.2d, v18.2d, v18.2d \n"
" fmul v27.2d, v19.2d, v19.2d \n"
" fmla v24.2d, v20.2d, v20.2d \n"
" fmla v25.2d, v21.2d, v21.2d \n"
" fmla v26.2d, v22.2d, v22.2d \n"
" fmla v27.2d, v23.2d, v23.2d \n"
" fadd v24.2d, v24.2d, v25.2d \n"
" fadd v26.2d, v26.2d, v27.2d \n"
" fadd v24.2d, v24.2d, v26.2d \n"
" faddp d24, v24.2d \n"
" fadd "SSQ", "SSQ", d24 \n"
" fmov "SCALE", "CUR_MAX" \n"
#endif
" subs "K", "K", #1 \n"
" bne .Lnrm2_kernel_F \n"
".Lnrm2_kernel_F_DONE: \n"
" ands "J", "J", #15 \n"
" beq .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F1: \n"
" ldr d4, ["X"] \n"
" fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" fdiv d4, d4, "CUR_MAX" \n"
" fmul d4, d4, d4 \n"
" fadd "SSQ", "SSQ", d4 \n"
" fmov "SCALE", "CUR_MAX" \n"
#if defined(COMPLEX)
" ldr d3, ["X", #8] \n"
" fabs d3, d3 \n"
" fmax "CUR_MAX", "SCALE", d3 \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
" fdiv d3, d3, "CUR_MAX" \n"
" fmul d3, d3, d3 \n"
" fadd "SSQ", "SSQ", d3 \n"
" fmov "SCALE", "CUR_MAX" \n"
#endif
" add "X", "X", "INC_X" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F1 \n"
".Lnrm2_kernel_L999: \n"
" str "SSQ", [%[SSQ_]] \n"
" str "SCALE", [%[SCALE_]] \n"
:
: [SSQ_] "r" (ssq), //%0
[SCALE_] "r" (scale), //%1
[N_] "r" (n), //%2
[X_] "r" (x), //%3
[INCX_] "r" (inc_x) //%4
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
);
}
#if defined(SMP)
static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
nrm2_compute(n, x, inc_x, result, result + 1);
return 0;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha[2];
#endif
FLOAT ssq, scale;
if (n <= 0 || inc_x <= 0) return 0.0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
nrm2_compute(n, x, inc_x, &ssq, &scale);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
double *ptr;
#if !defined(COMPLEX)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0,
( void *)nrm2_thread_function, nthreads);
scale = 0.0;
ssq = 1.0;
ptr = (double *)result;
for (i = 0; i < nthreads; i++) {
FLOAT cur_scale, cur_ssq;
cur_ssq = *ptr;
cur_scale = *(ptr + 1);
if (cur_scale != 0) {
if (cur_scale > scale) {
scale = (scale / cur_scale);
ssq = ssq * scale * scale;
ssq += cur_ssq;
scale = cur_scale;
} else {
cur_scale = (cur_scale / scale);
cur_ssq = cur_ssq * cur_scale * cur_scale;
ssq += cur_ssq;
}
}
ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif
ssq = sqrt(ssq) * scale;
return ssq;
}

View File

@ -36,54 +36,47 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define I "x5" /* loop variable */
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define J "x5" /* loop variable */
#define TMPF "s16"
#define TMPFD "d17"
#define SSQD "d0"
#define TMPF "d16"
#define SSQ "d0"
#define KERNEL_F1 \
"ldr "TMPF", ["X"], #4 \n" \
"fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
#if !defined(COMPLEX)
#define N_DIV_SHIFT "5"
#define N_REM_MASK "31"
#define INC_SHIFT "3"
#else
#define N_DIV_SHIFT "4"
#define N_REM_MASK "15"
#define INC_SHIFT "4"
#endif
#define KERNEL_F32 \
"ldur q16, ["X"] \n" \
"ldur q18, ["X", #16] \n" \
"ldur q20, ["X", #32] \n" \
"ldur q22, ["X", #48] \n" \
"ldur q24, ["X", #64] \n" \
"ldur q26, ["X", #80] \n" \
"ldur q28, ["X", #96] \n" \
"ldur q30, ["X", #112] \n" \
"add "X", "X", #128 \n" \
"fcvtl2 v17.2d, v16.4s \n" \
"fcvtl v16.2d, v16.2s \n" \
"fcvtl2 v19.2d, v18.4s \n" \
"fcvtl v18.2d, v18.2s \n" \
"fcvtl2 v21.2d, v20.4s \n" \
"fcvtl v20.2d, v20.2s \n" \
"fcvtl2 v23.2d, v22.4s \n" \
"fcvtl v22.2d, v22.2s \n" \
"fcvtl2 v25.2d, v24.4s \n" \
"fcvtl v24.2d, v24.2s \n" \
"fcvtl2 v27.2d, v26.4s \n" \
"fcvtl v26.2d, v26.2s \n" \
"fcvtl2 v29.2d, v28.4s \n" \
"fcvtl v28.2d, v28.2s \n" \
"fcvtl2 v31.2d, v30.4s \n" \
"fcvtl v30.2d, v30.2s \n" \
#define KERNEL_F \
"ldp q16, q17, ["X"] \n" \
"ldp q18, q19, ["X", #32] \n" \
"ldp q20, q21, ["X", #64] \n" \
"ldp q22, q23, ["X", #96] \n" \
"ldp q24, q25, ["X", #128] \n" \
"ldp q26, q27, ["X", #160] \n" \
"ldp q28, q29, ["X", #192] \n" \
"ldp q30, q31, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fmla v0.2d, v16.2d, v16.2d \n" \
"fmla v1.2d, v17.2d, v17.2d \n" \
"fmla v2.2d, v18.2d, v18.2d \n" \
"fmla v3.2d, v19.2d, v19.2d \n" \
"prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n" \
"fmla v4.2d, v20.2d, v20.2d \n" \
"fmla v5.2d, v21.2d, v21.2d \n" \
"fmla v6.2d, v22.2d, v22.2d \n" \
"fmla v7.2d, v23.2d, v23.2d \n" \
"prfm PLDL1KEEP, ["X", #1024+128] \n" \
"prfm PLDL1KEEP, ["X", #1024+192] \n" \
"fmla v0.2d, v24.2d, v24.2d \n" \
"fmla v1.2d, v25.2d, v25.2d \n" \
"fmla v2.2d, v26.2d, v26.2d \n" \
@ -91,11 +84,16 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
"fmla v4.2d, v28.2d, v28.2d \n" \
"fmla v5.2d, v29.2d, v29.2d \n" \
"fmla v6.2d, v30.2d, v30.2d \n" \
"fmla v7.2d, v31.2d, v31.2d \n" \
"prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n"
"fmla v7.2d, v31.2d, v31.2d \n"
#define KERNEL_F32_FINALIZE \
#if !defined(COMPLEX)
#define KERNEL_F1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"fmadd "SSQ", "TMPF", "TMPF", "SSQ" \n"
#define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
@ -103,14 +101,28 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n" \
"faddp "SSQD", v0.2d \n"
"faddp "SSQ", v0.2d \n"
#define KERNEL_S1 \
"ldr "TMPF", ["X"] \n" \
#define KERNEL_FINALIZE \
""
#else
#define KERNEL_F1 \
"ldr q16, ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
"fmla v0.2d, v16.2d, v16.2d \n"
#define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n"
#define KERNEL_FINALIZE \
"faddp "SSQ", v0.2d \n"
#endif
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
@ -122,7 +134,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" fmov "SSQD", xzr \n"
" fmov "SSQ", xzr \n"
" fmov d1, xzr \n"
" fmov d2, xzr \n"
" fmov d3, xzr \n"
@ -138,56 +150,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" bne .Lnrm2_kernel_S_BEGIN \n"
".Lnrm2_kernel_F_BEGIN: \n"
" asr "I", "N", #6 \n"
" cmp "I", xzr \n"
" beq .Lnrm2_kernel_S_BEGIN \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Lnrm2_kernel_F1 \n"
" .align 5 \n"
".Lnrm2_kernel_F64: \n"
" "KERNEL_F32" \n"
" "KERNEL_F32" \n"
" subs "I", "I", #1 \n"
" bne .Lnrm2_kernel_F64 \n"
" "KERNEL_F32_FINALIZE" \n"
".Lnrm2_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
".Lnrm2_kernel_F1: \n"
" ands "I", "N", #63 \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "I", "I", #1 \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F10 \n"
" b .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", #2 \n"
" asr "I", "N", #2 \n"
" cmp "I", xzr \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Lnrm2_kernel_S1 \n"
".Lnrm2_kernel_S4: \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" subs "I", "I", #1 \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S4 \n"
".Lnrm2_kernel_S1: \n"
" ands "I", "N", #3 \n"
" ands "J", "N", #3 \n"
" ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_S10: \n"
" "KERNEL_S1" \n"
" subs "I", "I", #1 \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S10 \n"
".Lnrm2_kernel_L999: \n"
" fmov %[RET_], "SSQD" \n"
" "KERNEL_FINALIZE" \n"
" str "SSQ", [%[RET_]] \n"
: [RET_] "=r" (ret) //%0
: [N_] "r" (n), //%1
:
: [RET_] "r" (&ret), //%0
[N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x) //%3
: "cc",
@ -214,13 +228,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
FLOAT dummy_alpha[2];
#endif
FLOAT nrm2 = 0.0;
double nrm2_double = 0.0;
if (n <= 0 || inc_x <= 0) return 0.0;
if (n == 1) return fabs(x[0]);
#if defined(SMP)
nthreads = num_cpu_avail(1);
@ -229,13 +241,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
nthreads = 1;
if (nthreads == 1) {
nrm2_double = nrm2_compute(n, x, inc_x);
nrm2 = nrm2_compute(n, x, inc_x);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
double *ptr;
mode = BLAS_SINGLE | BLAS_REAL;
#if !defined(COMPLEX)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0,
@ -243,14 +259,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
ptr = (double *)result;
for (i = 0; i < nthreads; i++) {
nrm2_double = nrm2_double + (*ptr) * (*ptr);
nrm2 = nrm2 + (*ptr);
ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
nrm2_double = nrm2_compute(n, x, inc_x);
nrm2 = nrm2_compute(n, x, inc_x);
#endif
nrm2 = sqrt(nrm2_double);
nrm2 = sqrt(nrm2);
return nrm2;
}

View File

@ -0,0 +1,380 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define INDEX "x3" /* index of max/min value */
#define Z "x4" /* vector index */
#define J "x5" /* loop variable */
#if !defined(DOUBLE)
#define MAXF "s0"
#define TMPF0 "s1"
#define TMPF1 "s4"
#define N_KERNEL_SIZE "64"
#define SZ "4"
#define N_DIV_SHIFT "6"
#define N_REM_MASK "63"
#define INC_SHIFT "2"
#else
#define MAXF "d0"
#define TMPF0 "d1"
#define TMPF1 "d4"
#define N_KERNEL_SIZE "32"
#define SZ "8"
#define N_DIV_SHIFT "5"
#define N_REM_MASK "31"
#define INC_SHIFT "3"
#endif
/******************************************************************************/
#if !defined(DOUBLE)
#define KERNEL_F \
"ldp q2, q3, ["X"] \n" \
"ldp q4, q5, ["X", #32] \n" \
"ldp q6, q7, ["X", #64] \n" \
"ldp q16, q17, ["X", #96] \n" \
"ldp q18, q19, ["X", #128] \n" \
"ldp q20, q21, ["X", #160] \n" \
"ldp q22, q23, ["X", #192] \n" \
"ldp q24, q25, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fabs v2.4s, v2.4s \n" \
"fabs v3.4s, v3.4s \n" \
"fabs v4.4s, v4.4s \n" \
"fabs v5.4s, v5.4s \n" \
"fabs v6.4s, v6.4s \n" \
"fabs v7.4s, v7.4s \n" \
"fabs v16.4s, v16.4s \n" \
"fabs v17.4s, v17.4s \n" \
"fabs v18.4s, v18.4s \n" \
"fabs v19.4s, v19.4s \n" \
"fabs v20.4s, v20.4s \n" \
"fabs v21.4s, v21.4s \n" \
"fabs v22.4s, v22.4s \n" \
"fabs v23.4s, v23.4s \n" \
"fabs v24.4s, v24.4s \n" \
"fabs v25.4s, v25.4s \n" \
"fmax v2.4s, v2.4s, v3.4s \n" \
"fmax v4.4s, v4.4s, v5.4s \n" \
"fmax v6.4s, v6.4s, v7.4s \n" \
"fmax v16.4s, v16.4s, v17.4s \n" \
"fmax v18.4s, v18.4s, v19.4s \n" \
"fmax v20.4s, v20.4s, v21.4s \n" \
"fmax v22.4s, v22.4s, v23.4s \n" \
"fmax v24.4s, v24.4s, v25.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fmax v2.4s, v2.4s, v4.4s \n" \
"fmax v6.4s, v6.4s, v16.4s \n" \
"fmax v18.4s, v18.4s, v20.4s \n" \
"fmax v22.4s, v22.4s, v24.4s \n" \
"fmax v2.4s, v2.4s, v6.4s \n" \
"fmax v18.4s, v18.4s, v22.4s \n" \
"fmax v2.4s, v2.4s, v18.4s \n" \
"fmaxv "TMPF0", v2.4s \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n" \
"add "Z", "Z", #"N_KERNEL_SIZE" \n"
#else
#define KERNEL_F \
"ldp q2, q3, ["X"] \n" \
"ldp q4, q5, ["X", #32] \n" \
"ldp q6, q7, ["X", #64] \n" \
"ldp q16, q17, ["X", #96] \n" \
"ldp q18, q19, ["X", #128] \n" \
"ldp q20, q21, ["X", #160] \n" \
"ldp q22, q23, ["X", #192] \n" \
"ldp q24, q25, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fabs v2.2d, v2.2d \n" \
"fabs v3.2d, v3.2d \n" \
"fabs v4.2d, v4.2d \n" \
"fabs v5.2d, v5.2d \n" \
"fabs v6.2d, v6.2d \n" \
"fabs v7.2d, v7.2d \n" \
"fabs v16.2d, v16.2d \n" \
"fabs v17.2d, v17.2d \n" \
"fabs v18.2d, v18.2d \n" \
"fabs v19.2d, v19.2d \n" \
"fabs v20.2d, v20.2d \n" \
"fabs v21.2d, v21.2d \n" \
"fabs v22.2d, v22.2d \n" \
"fabs v23.2d, v23.2d \n" \
"fabs v24.2d, v24.2d \n" \
"fabs v25.2d, v25.2d \n" \
"fmax v2.2d, v2.2d, v3.2d \n" \
"fmax v4.2d, v4.2d, v5.2d \n" \
"fmax v6.2d, v6.2d, v7.2d \n" \
"fmax v16.2d, v16.2d, v17.2d \n" \
"fmax v18.2d, v18.2d, v19.2d \n" \
"fmax v20.2d, v20.2d, v21.2d \n" \
"fmax v22.2d, v22.2d, v23.2d \n" \
"fmax v24.2d, v24.2d, v25.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fmax v2.2d, v2.2d, v4.2d \n" \
"fmax v6.2d, v6.2d, v16.2d \n" \
"fmax v18.2d, v18.2d, v20.2d \n" \
"fmax v22.2d, v22.2d, v24.2d \n" \
"fmax v2.2d, v2.2d, v6.2d \n" \
"fmax v18.2d, v18.2d, v22.2d \n" \
"fmax v2.2d, v2.2d, v18.2d \n" \
"ins v3.d[0], v2.d[1] \n" \
"fmax "TMPF0", d3, d2 \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n" \
"add "Z", "Z", #"N_KERNEL_SIZE" \n"
#endif
#define KERNEL_F_FINALIZE \
"sub x6, "INDEX", #1 \n" \
"lsl x6, x6, #"INC_SHIFT" \n" \
"add x7, x7, x6 \n" \
"mov x6, #0 \n" \
"1: \n" \
"add x6, x6, #1 \n" \
"cmp x6, #"N_KERNEL_SIZE" \n" \
"bge 2f \n" \
"ldr "TMPF1", [x7] \n" \
"fabs "TMPF1", "TMPF1" \n" \
"fcmp "MAXF", "TMPF1" \n" \
"add x7, x7, #"SZ" \n" \
"bne 1b \n" \
"2: \n" \
"sub x6, x6, #1 \n" \
"add "INDEX", "INDEX", x6 \n"
#define INIT \
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \
"ldr "MAXF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"mov "Z", #1 \n" \
"mov "INDEX", "Z" \n" \
"fabs "MAXF", "MAXF" \n"
#define KERNEL_S1 \
"ldr "TMPF0", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"add "Z", "Z", #1 \n" \
"fabs "TMPF0", "TMPF0" \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n"
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG index = 0;
if ( n < 0 ) return index;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" cmp "N", xzr \n"
" ble .Liamax_kernel_zero \n"
" cmp "INC_X", xzr \n"
" ble .Liamax_kernel_zero \n"
" cmp "INC_X", #1 \n"
" bne .Liamax_kernel_S_BEGIN \n"
" mov x7, "X" \n"
".Liamax_kernel_F_BEGIN: \n"
" "INIT" \n"
" subs "N", "N", #1 \n"
" ble .Liamax_kernel_L999 \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Liamax_kernel_F1 \n"
" add "Z", "Z", #1 \n"
".Liamax_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Liamax_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
" sub "Z", "Z", #1 \n"
".Liamax_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Liamax_kernel_L999 \n"
".Liamax_kernel_F10: \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Liamax_kernel_F10 \n"
" b .Liamax_kernel_L999 \n"
".Liamax_kernel_S_BEGIN: \n"
" "INIT" \n"
" subs "N", "N", #1 \n"
" ble .Liamax_kernel_L999 \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Liamax_kernel_S1 \n"
".Liamax_kernel_S4: \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Liamax_kernel_S4 \n"
".Liamax_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Liamax_kernel_L999 \n"
".Liamax_kernel_S10: \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Liamax_kernel_S10 \n"
".Liamax_kernel_L999: \n"
" mov x0, "INDEX" \n"
" b .Liamax_kernel_DONE \n"
".Liamax_kernel_zero: \n"
" mov x0, xzr \n"
".Liamax_kernel_DONE: \n"
" mov %[INDEX_], "INDEX" \n"
: [INDEX_] "=r" (index) //%0
: [N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x) //%3
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
return index;
}
#if defined(SMP)
static int iamax_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
*(BLASLONG *)result = iamax_compute(n, x, inc_x);
return 0;
}
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
#endif
BLASLONG max_index = 0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
max_index = iamax_compute(n, x, inc_x);
} else {
BLASLONG i, width, cur_index;
int num_cpu;
int mode;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
FLOAT max = -1.0;
#if !defined(DOUBLE)
mode = BLAS_SINGLE;
#else
mode = BLAS_DOUBLE;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0,
( void *)iamax_thread_function, nthreads);
num_cpu = 0;
i = n;
cur_index = 0;
while (i > 0) {
FLOAT elem;
BLASLONG cur_max_index;
cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2];
elem = x[((cur_index + cur_max_index - 1) * inc_x)];
elem = fabs(elem);
if (elem >= max) {
max = elem;
max_index = cur_index + cur_max_index;
}
width = blas_quickdivide(i + nthreads - num_cpu - 1,
nthreads - num_cpu);
i -= width;
cur_index += width;
num_cpu ++;
}
}
#else
max_index = iamax_compute(n, x, inc_x);
#endif
return max_index;
}

View File

@ -0,0 +1,390 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define INDEX "x3" /* index of max/min value */
#define Z "x4" /* vector index */
#define J "x5" /* loop variable */
#if !defined(DOUBLE)
#define MAXF "s0"
#define TMPF0 "s1"
#define TMPF0V "v1.2s"
#define TMPF1 "d4"
#define TMPF1V "v4.2s"
#define N_KERNEL_SIZE "32"
#define SZ "8"
#define N_DIV_SHIFT "5"
#define N_REM_MASK "31"
#define INC_SHIFT "3"
#else
#define MAXF "d0"
#define TMPF0 "d1"
#define TMPF0V "v1.2d"
#define TMPF1 "q4"
#define TMPF1V "v4.2d"
#define N_KERNEL_SIZE "16"
#define SZ "16"
#define N_DIV_SHIFT "4"
#define N_REM_MASK "15"
#define INC_SHIFT "4"
#endif
/******************************************************************************/
#if !defined(DOUBLE)
#define KERNEL_F \
"ldp q2, q3, ["X"] \n" \
"ldp q4, q5, ["X", #32] \n" \
"ldp q6, q7, ["X", #64] \n" \
"ldp q16, q17, ["X", #96] \n" \
"ldp q18, q19, ["X", #128] \n" \
"ldp q20, q21, ["X", #160] \n" \
"ldp q22, q23, ["X", #192] \n" \
"ldp q24, q25, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fabs v2.4s, v2.4s \n" \
"fabs v3.4s, v3.4s \n" \
"fabs v4.4s, v4.4s \n" \
"fabs v5.4s, v5.4s \n" \
"fabs v6.4s, v6.4s \n" \
"fabs v7.4s, v7.4s \n" \
"fabs v16.4s, v16.4s \n" \
"fabs v17.4s, v17.4s \n" \
"fabs v18.4s, v18.4s \n" \
"fabs v19.4s, v19.4s \n" \
"fabs v20.4s, v20.4s \n" \
"fabs v21.4s, v21.4s \n" \
"fabs v22.4s, v22.4s \n" \
"fabs v23.4s, v23.4s \n" \
"fabs v24.4s, v24.4s \n" \
"fabs v25.4s, v25.4s \n" \
"faddp v2.4s, v2.4s, v3.4s \n" \
"faddp v4.4s, v4.4s, v5.4s \n" \
"faddp v6.4s, v6.4s, v7.4s \n" \
"faddp v16.4s, v16.4s, v17.4s \n" \
"faddp v18.4s, v18.4s, v19.4s \n" \
"faddp v20.4s, v20.4s, v21.4s \n" \
"faddp v22.4s, v22.4s, v23.4s \n" \
"faddp v24.4s, v24.4s, v25.4s \n" \
"fmax v2.4s, v2.4s, v4.4s \n" \
"fmax v6.4s, v6.4s, v16.4s \n" \
"fmax v18.4s, v18.4s, v20.4s \n" \
"fmax v22.4s, v22.4s, v24.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fmax v2.4s, v2.4s, v6.4s \n" \
"fmax v18.4s, v18.4s, v22.4s \n" \
"fmax v2.4s, v2.4s, v18.4s \n" \
"fmaxv "TMPF0", v2.4s \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n" \
"add "Z", "Z", #"N_KERNEL_SIZE" \n"
#else
#define KERNEL_F \
"ldp q2, q3, ["X"] \n" \
"ldp q4, q5, ["X", #32] \n" \
"ldp q6, q7, ["X", #64] \n" \
"ldp q16, q17, ["X", #96] \n" \
"ldp q18, q19, ["X", #128] \n" \
"ldp q20, q21, ["X", #160] \n" \
"ldp q22, q23, ["X", #192] \n" \
"ldp q24, q25, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fabs v2.2d, v2.2d \n" \
"fabs v3.2d, v3.2d \n" \
"fabs v4.2d, v4.2d \n" \
"fabs v5.2d, v5.2d \n" \
"fabs v6.2d, v6.2d \n" \
"fabs v7.2d, v7.2d \n" \
"fabs v16.2d, v16.2d \n" \
"fabs v17.2d, v17.2d \n" \
"fabs v18.2d, v18.2d \n" \
"fabs v19.2d, v19.2d \n" \
"fabs v20.2d, v20.2d \n" \
"fabs v21.2d, v21.2d \n" \
"fabs v22.2d, v22.2d \n" \
"fabs v23.2d, v23.2d \n" \
"fabs v24.2d, v24.2d \n" \
"fabs v25.2d, v25.2d \n" \
"faddp v2.2d, v2.2d, v3.2d \n" \
"faddp v4.2d, v4.2d, v5.2d \n" \
"faddp v6.2d, v6.2d, v7.2d \n" \
"faddp v16.2d, v16.2d, v17.2d \n" \
"faddp v18.2d, v18.2d, v19.2d \n" \
"faddp v20.2d, v20.2d, v21.2d \n" \
"faddp v22.2d, v22.2d, v23.2d \n" \
"faddp v24.2d, v24.2d, v25.2d \n" \
"fmax v2.2d, v2.2d, v4.2d \n" \
"fmax v6.2d, v6.2d, v16.2d \n" \
"fmax v18.2d, v18.2d, v20.2d \n" \
"fmax v22.2d, v22.2d, v24.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fmax v2.2d, v2.2d, v6.2d \n" \
"fmax v18.2d, v18.2d, v22.2d \n" \
"fmax v2.2d, v2.2d, v18.2d \n" \
"ins v3.d[0], v2.d[1] \n" \
"fmax "TMPF0", d3, d2 \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n" \
"add "Z", "Z", #"N_KERNEL_SIZE" \n"
#endif
#define KERNEL_F_FINALIZE \
"sub x6, "INDEX", #1 \n" \
"lsl x6, x6, #"INC_SHIFT" \n" \
"add x7, x7, x6 \n" \
"mov x6, #0 \n" \
"1: \n" \
"add x6, x6, #1 \n" \
"cmp x6, #"N_KERNEL_SIZE" \n" \
"bge 2f \n" \
"ldr "TMPF1", [x7] \n" \
"fabs "TMPF1V", "TMPF1V" \n" \
"faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \
"fcmp "MAXF", "TMPF0" \n" \
"add x7, x7, #"SZ" \n" \
"bne 1b \n" \
"2: \n" \
"sub x6, x6, #1 \n" \
"add "INDEX", "INDEX", x6 \n"
#define INIT \
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \
"ldr "TMPF1", ["X"] \n" \
"fabs "TMPF1V", "TMPF1V" \n" \
"faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \
"fmov "MAXF" , "TMPF0" \n" \
"add "X", "X", "INC_X" \n" \
"mov "Z", #1 \n" \
"mov "INDEX", "Z" \n" \
"fabs "MAXF", "MAXF" \n"
#define KERNEL_S1 \
"ldr "TMPF1", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"add "Z", "Z", #1 \n" \
"fabs "TMPF1V", "TMPF1V" \n" \
"faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \
"fcmp "MAXF", "TMPF0" \n" \
"fcsel "MAXF", "MAXF", "TMPF0", ge \n" \
"csel "INDEX", "INDEX", "Z", ge \n"
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG index = 0;
if ( n < 0 ) return index;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" cmp "N", xzr \n"
" ble .Lizamax_kernel_zero \n"
" cmp "INC_X", xzr \n"
" ble .Lizamax_kernel_zero \n"
" cmp "INC_X", #1 \n"
" bne .Lizamax_kernel_S_BEGIN \n"
" mov x7, "X" \n"
".Lizamax_kernel_F_BEGIN: \n"
" "INIT" \n"
" subs "N", "N", #1 \n"
" ble .Lizamax_kernel_L999 \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Lizamax_kernel_F1 \n"
" add "Z", "Z", #1 \n"
".Lizamax_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Lizamax_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
" sub "Z", "Z", #1 \n"
".Lizamax_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Lizamax_kernel_L999 \n"
".Lizamax_kernel_F10: \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Lizamax_kernel_F10 \n"
" b .Lizamax_kernel_L999 \n"
".Lizamax_kernel_S_BEGIN: \n"
" "INIT" \n"
" subs "N", "N", #1 \n"
" ble .Lizamax_kernel_L999 \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Lizamax_kernel_S1 \n"
".Lizamax_kernel_S4: \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Lizamax_kernel_S4 \n"
".Lizamax_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Lizamax_kernel_L999 \n"
".Lizamax_kernel_S10: \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Lizamax_kernel_S10 \n"
".Lizamax_kernel_L999: \n"
" mov x0, "INDEX" \n"
" b .Lizamax_kernel_DONE \n"
".Lizamax_kernel_zero: \n"
" mov x0, xzr \n"
".Lizamax_kernel_DONE: \n"
" mov %[INDEX_], "INDEX" \n"
: [INDEX_] "=r" (index) //%0
: [N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x) //%3
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
return index;
}
#if defined(SMP)
static int izamax_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
*(BLASLONG *)result = izamax_compute(n, x, inc_x);
return 0;
}
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha[2];
#endif
BLASLONG max_index = 0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (inc_x == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
max_index = izamax_compute(n, x, inc_x);
} else {
BLASLONG i, width, cur_index;
int num_cpu;
int mode;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
FLOAT max = -1.0;
#if !defined(DOUBLE)
mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0,
( void *)izamax_thread_function, nthreads);
num_cpu = 0;
i = n;
cur_index = 0;
while (i > 0) {
FLOAT elem_r, elem_i;
BLASLONG cur_max_index;
cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2];
elem_r = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 0];
elem_i = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 1];
elem_r = fabs(elem_r) + fabs(elem_i);
if (elem_r >= max) {
max = elem_r;
max_index = cur_index + cur_max_index;
}
width = blas_quickdivide(i + nthreads - num_cpu - 1,
nthreads - num_cpu);
i -= width;
cur_index += width;
num_cpu ++;
}
}
#else
max_index = izamax_compute(n, x, inc_x);
#endif
return max_index;
}

View File

@ -0,0 +1,355 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
#if !defined(COMPLEX)
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define J "x5" /* loop variable */
#define TMPF "s16"
#define TMPFD "d17"
#define SSQD "d0"
#define N_DIV_SHIFT "6"
#define N_REM_MASK "63"
#define INC_SHIFT "2"
#define KERNEL_F1 \
"ldr "TMPF", ["X"], #4 \n" \
"fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
#define KERNEL_F \
KERNEL_F32 \
KERNEL_F32
#define KERNEL_F32 \
"ldur q16, ["X"] \n" \
"ldur q18, ["X", #16] \n" \
"ldur q20, ["X", #32] \n" \
"ldur q22, ["X", #48] \n" \
"ldur q24, ["X", #64] \n" \
"ldur q26, ["X", #80] \n" \
"ldur q28, ["X", #96] \n" \
"ldur q30, ["X", #112] \n" \
"add "X", "X", #128 \n" \
"fcvtl2 v17.2d, v16.4s \n" \
"fcvtl v16.2d, v16.2s \n" \
"fcvtl2 v19.2d, v18.4s \n" \
"fcvtl v18.2d, v18.2s \n" \
"fcvtl2 v21.2d, v20.4s \n" \
"fcvtl v20.2d, v20.2s \n" \
"fcvtl2 v23.2d, v22.4s \n" \
"fcvtl v22.2d, v22.2s \n" \
"fcvtl2 v25.2d, v24.4s \n" \
"fcvtl v24.2d, v24.2s \n" \
"fcvtl2 v27.2d, v26.4s \n" \
"fcvtl v26.2d, v26.2s \n" \
"fcvtl2 v29.2d, v28.4s \n" \
"fcvtl v28.2d, v28.2s \n" \
"fcvtl2 v31.2d, v30.4s \n" \
"fcvtl v30.2d, v30.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n" \
"fmla v1.2d, v17.2d, v17.2d \n" \
"fmla v2.2d, v18.2d, v18.2d \n" \
"fmla v3.2d, v19.2d, v19.2d \n" \
"fmla v4.2d, v20.2d, v20.2d \n" \
"fmla v5.2d, v21.2d, v21.2d \n" \
"fmla v6.2d, v22.2d, v22.2d \n" \
"fmla v7.2d, v23.2d, v23.2d \n" \
"fmla v0.2d, v24.2d, v24.2d \n" \
"fmla v1.2d, v25.2d, v25.2d \n" \
"fmla v2.2d, v26.2d, v26.2d \n" \
"fmla v3.2d, v27.2d, v27.2d \n" \
"fmla v4.2d, v28.2d, v28.2d \n" \
"fmla v5.2d, v29.2d, v29.2d \n" \
"fmla v6.2d, v30.2d, v30.2d \n" \
"fmla v7.2d, v31.2d, v31.2d \n" \
"prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n"
#define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n" \
"faddp "SSQD", v0.2d \n"
#define KERNEL_S1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
#define KERNEL_FINALIZE \
""
#else
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define J "x5" /* loop variable */
#define TMPF "d16"
#define SSQD "d0"
#define N_DIV_SHIFT "4"
#define N_REM_MASK "15"
#define INC_SHIFT "3"
#define KERNEL_F1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", #8 \n" \
"fcvtl v16.2d, v16.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n"
#define KERNEL_F \
"ldur q16, ["X"] \n" \
"ldur q18, ["X", #16] \n" \
"ldur q20, ["X", #32] \n" \
"ldur q22, ["X", #48] \n" \
"ldur q24, ["X", #64] \n" \
"ldur q26, ["X", #80] \n" \
"ldur q28, ["X", #96] \n" \
"ldur q30, ["X", #112] \n" \
"add "X", "X", #128 \n" \
"fcvtl2 v17.2d, v16.4s \n" \
"fcvtl v16.2d, v16.2s \n" \
"fcvtl2 v19.2d, v18.4s \n" \
"fcvtl v18.2d, v18.2s \n" \
"fcvtl2 v21.2d, v20.4s \n" \
"fcvtl v20.2d, v20.2s \n" \
"fcvtl2 v23.2d, v22.4s \n" \
"fcvtl v22.2d, v22.2s \n" \
"fcvtl2 v25.2d, v24.4s \n" \
"fcvtl v24.2d, v24.2s \n" \
"fcvtl2 v27.2d, v26.4s \n" \
"fcvtl v26.2d, v26.2s \n" \
"fcvtl2 v29.2d, v28.4s \n" \
"fcvtl v28.2d, v28.2s \n" \
"fcvtl2 v31.2d, v30.4s \n" \
"fcvtl v30.2d, v30.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n" \
"fmla v1.2d, v17.2d, v17.2d \n" \
"fmla v2.2d, v18.2d, v18.2d \n" \
"fmla v3.2d, v19.2d, v19.2d \n" \
"fmla v4.2d, v20.2d, v20.2d \n" \
"fmla v5.2d, v21.2d, v21.2d \n" \
"fmla v6.2d, v22.2d, v22.2d \n" \
"fmla v7.2d, v23.2d, v23.2d \n" \
"fmla v0.2d, v24.2d, v24.2d \n" \
"fmla v1.2d, v25.2d, v25.2d \n" \
"fmla v2.2d, v26.2d, v26.2d \n" \
"fmla v3.2d, v27.2d, v27.2d \n" \
"fmla v4.2d, v28.2d, v28.2d \n" \
"fmla v5.2d, v29.2d, v29.2d \n" \
"fmla v6.2d, v30.2d, v30.2d \n" \
"fmla v7.2d, v31.2d, v31.2d \n" \
"prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n"
#define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n"
#define KERNEL_FINALIZE \
"faddp "SSQD", v0.2d \n"
#define KERNEL_S1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"fcvtl v16.2d, v16.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n"
#endif
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
double ret = 0.0 ;
if (n <= 0) return ret;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" fmov "SSQD", xzr \n"
" fmov d1, xzr \n"
" fmov d2, xzr \n"
" fmov d3, xzr \n"
" fmov d4, xzr \n"
" fmov d5, xzr \n"
" fmov d6, xzr \n"
" fmov d7, xzr \n"
" cmp "N", xzr \n"
" ble .Lnrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n"
" ble .Lnrm2_kernel_L999 \n"
" cmp "INC_X", #1 \n"
" bne .Lnrm2_kernel_S_BEGIN \n"
".Lnrm2_kernel_F_BEGIN: \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Lnrm2_kernel_S_BEGIN \n"
" .align 5 \n"
".Lnrm2_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
".Lnrm2_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_F10 \n"
" b .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Lnrm2_kernel_S1 \n"
".Lnrm2_kernel_S4: \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S4 \n"
".Lnrm2_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_S10: \n"
" "KERNEL_S1" \n"
" subs "J", "J", #1 \n"
" bne .Lnrm2_kernel_S10 \n"
".Lnrm2_kernel_L999: \n"
" "KERNEL_FINALIZE" \n"
" fmov %[RET_], "SSQD" \n"
: [RET_] "=r" (ret) //%0
: [N_] "r" (n), //%1
[X_] "r" (x), //%2
[INCX_] "r" (inc_x) //%3
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
return ret;
}
#if defined(SMP)
static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
*(double *)result = nrm2_compute(n, x, inc_x);
return 0;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha[2];
#endif
FLOAT nrm2 = 0.0;
double nrm2_double = 0.0;
if (n <= 0 || inc_x <= 0) return 0.0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
nrm2_double = nrm2_compute(n, x, inc_x);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
double *ptr;
#if !defined(COMPLEX)
mode = BLAS_SINGLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0,
( void *)nrm2_thread_function, nthreads);
ptr = (double *)result;
for (i = 0; i < nthreads; i++) {
nrm2_double = nrm2_double + (*ptr);
ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
nrm2_double = nrm2_compute(n, x, inc_x);
#endif
nrm2 = sqrt(nrm2_double);
return nrm2;
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -29,60 +29,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4
/******************************************************************************/
.macro INIT_F1
ldr TMPF, [X], #SZ
fmul SSQ, TMPF, TMPF
.endm
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define TMPF0 s0
#define TMPF1 s1
#define INC_SHIFT 2
#define N_DIV_SHIFT 2
#define N_REM_MASK 3
#else
#define TMPF0 d0
#define TMPF1 d1
#define INC_SHIFT 3
#define N_DIV_SHIFT 1
#define N_REM_MASK 1
#endif
#else
#if !defined(DOUBLE)
#define TMPF0 d0
#define TMPF1 d1
#define INC_SHIFT 3
#define N_DIV_SHIFT 1
#define N_REM_MASK 1
#else
#define TMPF0 q0
#define TMPF1 q1
#define INC_SHIFT 4
#define N_DIV_SHIFT 0
#define N_REM_MASK 0
#endif
#endif
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
ldr TMPF0, [X]
ldr TMPF1, [Y]
str TMPF0, [Y]
str TMPF1, [X]
add X, X, INC_X
add Y, Y, INC_Y
.endm
.macro INIT_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp SSQ, v2.2s
.macro KERNEL_F
ldr q0, [X]
ldr q1, [Y]
add X, X, #16
add Y, Y, #16
prfm PLDL1STRM, [X, #1024]
prfm PLDL1STRM, [Y, #1024]
str q0, [Y, #-16]
str q1, [X, #-16]
.endm
.macro KERNEL_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp TMPF, v2.2s
fadd SSQ, SSQ, TMPF
.macro INIT
lsl INC_X, INC_X, #INC_SHIFT
lsl INC_Y, INC_Y, #INC_SHIFT
.endm
.macro INIT_S
lsl INC_X, INC_X, #2
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
/*******************************************************************************
* End of macro definitions
@ -91,88 +104,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
ble .Lswap_kernel_L999
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1
bne .Lswap_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:
INIT
asr I, N, #2
asr I, N, #N_DIV_SHIFT
cmp I, xzr
beq nrm2_kernel_F1_INIT
beq .Lswap_kernel_F1
INIT_F4
subs I, I, #1
beq nrm2_kernel_F1
.align 5
.Lswap_kernel_F:
nrm2_kernel_F4:
KERNEL_F4
KERNEL_F
subs I, I, #1
bne nrm2_kernel_F4
bne .Lswap_kernel_F
nrm2_kernel_F1:
.Lswap_kernel_F1:
ands I, N, #3
ble nrm2_kernel_L999
#if defined(DOUBLE) && defined(COMPLEX)
b .Lswap_kernel_L999
#else
ands I, N, #N_REM_MASK
ble .Lswap_kernel_L999
#endif
nrm2_kernel_F10:
.Lswap_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
bne .Lswap_kernel_F10
b nrm2_kernel_L999
b .Lswap_kernel_L999
nrm2_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b nrm2_kernel_F1
nrm2_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble nrm2_kernel_L999
INIT
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
ble .Lswap_kernel_S1
nrm2_kernel_S4:
.Lswap_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_S4
bne .Lswap_kernel_S4
nrm2_kernel_S1:
.Lswap_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
ble .Lswap_kernel_L999
nrm2_kernel_S10:
.Lswap_kernel_S10:
KERNEL_S1
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_S10
subs I, I, #1
bne .Lswap_kernel_S10
nrm2_kernel_L999:
fsqrt SSQ, SSQ
ret
nrm2_kernel_zero:
fmov SSQ, wzr
.Lswap_kernel_L999:
mov w0, wzr
ret
EPILOGUE

View File

@ -0,0 +1,357 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define Y "x3" /* "Y" vector address */
#define INC_Y "x4" /* "Y" stride */
#define J "x5" /* loop variable */
#if !defined(DOUBLE)
#define REG0 "wzr"
#define DOTF "s0"
#define DOTI "s1"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "4"
#define N_REM_MASK "15"
#else
#define REG0 "xzr"
#define DOTF "d0"
#define DOTI "d1"
#define INC_SHIFT "4"
#define N_DIV_SHIFT "3"
#define N_REM_MASK "7"
#endif
#if !defined(CONJ)
#define f_ii "fmls"
#define f_ir "fmla"
#define a_ii "fsub"
#define a_ir "fadd"
#else
#define f_ii "fmla"
#define f_ir "fmls"
#define a_ii "fadd"
#define a_ir "fsub"
#endif
#if !defined(DOUBLE)
#define KERNEL_F1 \
" ldr d16, ["X"] \n" \
" ldr d24, ["Y"] \n" \
" add "X", "X", "INC_X" \n" \
" add "Y", "Y", "INC_Y" \n" \
" ins v17.s[0], v16.s[1] \n" \
" fmla "DOTF", s16, v24.s[0] \n" \
" "f_ii" "DOTF", s17, v24.s[1] \n" \
" "f_ir" "DOTI", s17, v24.s[0] \n" \
" fmla "DOTI", s16, v24.s[1] \n"
#define KERNEL_F \
" ld2 {v16.4s, v17.4s}, ["X"] \n" \
" ld2 {v24.4s, v25.4s}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" ld2 {v18.4s, v19.4s}, ["X"] \n" \
" ld2 {v26.4s, v27.4s}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" fmla v0.4s, v16.4s, v24.4s \n" \
" fmla v1.4s, v17.4s, v25.4s \n" \
" fmla v2.4s, v16.4s, v25.4s \n" \
" fmla v3.4s, v17.4s, v24.4s \n" \
" ld2 {v20.4s, v21.4s}, ["X"] \n" \
" ld2 {v28.4s, v29.4s}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" fmla v4.4s, v18.4s, v26.4s \n" \
" fmla v5.4s, v19.4s, v27.4s \n" \
" fmla v6.4s, v18.4s, v27.4s \n" \
" fmla v7.4s, v19.4s, v26.4s \n" \
" ld2 {v22.4s, v23.4s}, ["X"] \n" \
" ld2 {v30.4s, v31.4s}, ["Y"] \n" \
" fmla v0.4s, v20.4s, v28.4s \n" \
" fmla v1.4s, v21.4s, v29.4s \n" \
" fmla v2.4s, v20.4s, v29.4s \n" \
" fmla v3.4s, v21.4s, v28.4s \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" PRFM PLDL1KEEP, ["X", #1024] \n" \
" PRFM PLDL1KEEP, ["Y", #1024] \n" \
" PRFM PLDL1KEEP, ["X", #1024+64] \n" \
" PRFM PLDL1KEEP, ["Y", #1024+64] \n" \
" fmla v4.4s, v22.4s, v30.4s \n" \
" fmla v5.4s, v23.4s, v31.4s \n" \
" fmla v6.4s, v22.4s, v31.4s \n" \
" fmla v7.4s, v23.4s, v30.4s \n"
#define KERNEL_F_FINALIZE \
" fadd v0.4s, v0.4s, v4.4s \n" \
" fadd v1.4s, v1.4s, v5.4s \n" \
" fadd v2.4s, v2.4s, v6.4s \n" \
" fadd v3.4s, v3.4s, v7.4s \n" \
" "a_ii" v0.4s, v0.4s, v1.4s \n" \
" "a_ir" v1.4s, v2.4s, v3.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v1.4s, v1.4s, v1.4s \n" \
" faddp v1.4s, v1.4s, v1.4s \n"
#else
#define KERNEL_F1 \
" ldr q16, ["X"] \n" \
" ldr q24, ["Y"] \n" \
" add "X", "X", "INC_X" \n" \
" add "Y", "Y", "INC_Y" \n" \
" ins v17.d[0], v16.d[1] \n" \
" fmla "DOTF", d16, v24.d[0] \n" \
" "f_ii" "DOTF", d17, v24.d[1] \n" \
" "f_ir" "DOTI", d17, v24.d[0] \n" \
" fmla "DOTI", d16, v24.d[1] \n"
#define KERNEL_F \
" ld2 {v16.2d, v17.2d}, ["X"] \n" \
" ld2 {v24.2d, v25.2d}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" ld2 {v18.2d, v19.2d}, ["X"] \n" \
" ld2 {v26.2d, v27.2d}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" fmla v0.2d, v16.2d, v24.2d \n" \
" fmla v1.2d, v17.2d, v25.2d \n" \
" fmla v2.2d, v16.2d, v25.2d \n" \
" fmla v3.2d, v17.2d, v24.2d \n" \
" ld2 {v20.2d, v21.2d}, ["X"] \n" \
" ld2 {v28.2d, v29.2d}, ["Y"] \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" fmla v4.2d, v18.2d, v26.2d \n" \
" fmla v5.2d, v19.2d, v27.2d \n" \
" fmla v6.2d, v18.2d, v27.2d \n" \
" fmla v7.2d, v19.2d, v26.2d \n" \
" ld2 {v22.2d, v23.2d}, ["X"] \n" \
" ld2 {v30.2d, v31.2d}, ["Y"] \n" \
" fmla v0.2d, v20.2d, v28.2d \n" \
" fmla v1.2d, v21.2d, v29.2d \n" \
" fmla v2.2d, v20.2d, v29.2d \n" \
" fmla v3.2d, v21.2d, v28.2d \n" \
" add "X", "X", #32 \n" \
" add "Y", "Y", #32 \n" \
" PRFM PLDL1KEEP, ["X", #1024] \n" \
" PRFM PLDL1KEEP, ["Y", #1024] \n" \
" PRFM PLDL1KEEP, ["X", #1024+64] \n" \
" PRFM PLDL1KEEP, ["Y", #1024+64] \n" \
" fmla v4.2d, v22.2d, v30.2d \n" \
" fmla v5.2d, v23.2d, v31.2d \n" \
" fmla v6.2d, v22.2d, v31.2d \n" \
" fmla v7.2d, v23.2d, v30.2d \n"
#define KERNEL_F_FINALIZE \
" fadd v0.2d, v0.2d, v4.2d \n" \
" fadd v1.2d, v1.2d, v5.2d \n" \
" fadd v2.2d, v2.2d, v6.2d \n" \
" fadd v3.2d, v3.2d, v7.2d \n" \
" "a_ii" v0.2d, v0.2d, v1.2d \n" \
" "a_ir" v1.2d, v2.2d, v3.2d \n" \
" faddp "DOTF", v0.2d \n" \
" faddp "DOTI", v1.2d \n"
#endif
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result)
{
FLOAT dotr = 0.0, doti = 0.0;
CREAL(*result) = 0.0;
CIMAG(*result) = 0.0;
if ( n < 0 ) return;
__asm__ __volatile__ (
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n"
" fmov "DOTF", "REG0" \n"
" fmov "DOTI", "REG0" \n"
" fmov d2, xzr \n"
" fmov d3, xzr \n"
" fmov d4, xzr \n"
" fmov d5, xzr \n"
" fmov d6, xzr \n"
" fmov d7, xzr \n"
" cmp "N", xzr \n"
" ble .Ldot_kernel_L999 \n"
" cmp "INC_X", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n"
" bne .Ldot_kernel_S_BEGIN \n"
".Ldot_kernel_F_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq .Ldot_kernel_F1 \n"
" .align 5 \n"
".Ldot_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F \n"
" "KERNEL_F_FINALIZE" \n"
".Ldot_kernel_F1: \n"
" ands "J", "N", #"N_REM_MASK" \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_F10 \n"
" b .Ldot_kernel_L999 \n"
".Ldot_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble .Ldot_kernel_S1 \n"
".Ldot_kernel_S4: \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S4 \n"
".Ldot_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble .Ldot_kernel_L999 \n"
".Ldot_kernel_S10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne .Ldot_kernel_S10 \n"
".Ldot_kernel_L999: \n"
" str "DOTF", [%[DOTR_]] \n"
" str "DOTI", [%[DOTI_]] \n"
:
: [DOTR_] "r" (&dotr), //%0
[DOTI_] "r" (&doti), //%1
[N_] "r" (n), //%2
[X_] "r" (x), //%3
[INCX_] "r" (inc_x), //%4
[Y_] "r" (y), //%5
[INCY_] "r" (inc_y) //%6
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
CREAL(*result) = dotr;
CIMAG(*result) = doti;
return;
}
#if defined(SMP)
static int zdot_thread_function(BLASLONG n, BLASLONG dummy0,
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
zdot_compute(n, x, inc_x, y, inc_y, (void *)result);
return 0;
}
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
#endif
OPENBLAS_COMPLEX_FLOAT zdot;
CREAL(zdot) = 0.0;
CIMAG(zdot) = 0.0;
#if defined(SMP)
nthreads = num_cpu_avail(1);
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
} else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
OPENBLAS_COMPLEX_FLOAT *ptr;
#if !defined(DOUBLE)
mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, result, 0,
( void *)zdot_thread_function, nthreads);
ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
for (i = 0; i < nthreads; i++) {
CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
#endif
return zdot;
}

File diff suppressed because it is too large Load Diff

View File

@ -2447,17 +2447,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P sgemm_p
#define DGEMM_DEFAULT_P dgemm_p
#define CGEMM_DEFAULT_P cgemm_p
#define ZGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P zgemm_p
#define SGEMM_DEFAULT_Q sgemm_q
#define DGEMM_DEFAULT_Q dgemm_q
#define CGEMM_DEFAULT_Q cgemm_q
#define ZGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q zgemm_q
#define SGEMM_DEFAULT_R sgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R 2048
#define ZGEMM_DEFAULT_R zgemm_r
#define SYMV_P 16
#endif