THUNDERX2T99: Remove Duplicate Code
This commit is contained in:
parent
2757b49767
commit
e0dc5f58c5
|
@ -1,19 +1,11 @@
|
||||||
include $(KERNELDIR)/KERNEL.CORTEXA57
|
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||||
|
|
||||||
ifndef SMP
|
SNRM2KERNEL = snrm2_thunderx2t99.c
|
||||||
SNRM2KERNEL = snrm2_thunderx2t99.S
|
|
||||||
else
|
|
||||||
SNRM2KERNEL = nrm2_thunderx2t99_threaded.c
|
|
||||||
endif
|
|
||||||
CNRM2KERNEL = cnrm2_thunderx2t99.S
|
CNRM2KERNEL = cnrm2_thunderx2t99.S
|
||||||
|
|
||||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
|
|
||||||
ifndef SMP
|
DDOTKERNEL = ddot_thunderx2t99.c
|
||||||
DDOTKERNEL = ddot_thunderx2t99.S
|
|
||||||
else
|
|
||||||
DDOTKERNEL = ddot_thunderx2t99_threaded.c
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||||
|
|
|
@ -1,207 +0,0 @@
|
||||||
/*******************************************************************************
|
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
|
||||||
All rights reserved.
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
1. Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in
|
|
||||||
the documentation and/or other materials provided with the
|
|
||||||
distribution.
|
|
||||||
3. Neither the name of the OpenBLAS project nor the names of
|
|
||||||
its contributors may be used to endorse or promote products
|
|
||||||
derived from this software without specific prior written permission.
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
||||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
#define ASSEMBLER
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#define N x0 /* vector length */
|
|
||||||
#define X x1 /* X vector address */
|
|
||||||
#define INC_X x2 /* X stride */
|
|
||||||
#define Y x3 /* Y vector address */
|
|
||||||
#define INC_Y x4 /* Y stride */
|
|
||||||
#define I x5 /* loop variable */
|
|
||||||
|
|
||||||
/*******************************************************************************
|
|
||||||
* Macro definitions
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
#define REG0 xzr
|
|
||||||
#define DOTF d0
|
|
||||||
#define TMPX d16
|
|
||||||
#define LD1VX {v16.d}[0]
|
|
||||||
#define TMPY d24
|
|
||||||
#define LD1VY {v24.d}[0]
|
|
||||||
#define SZ 8
|
|
||||||
|
|
||||||
/******************************************************************************/
|
|
||||||
|
|
||||||
.macro KERNEL_F1
|
|
||||||
ldr TMPX, [X]
|
|
||||||
ldr TMPY, [Y]
|
|
||||||
add X, X, #SZ
|
|
||||||
add Y, Y, #SZ
|
|
||||||
fmadd DOTF, TMPX, TMPY, DOTF
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F16
|
|
||||||
ldp q16, q17, [X]
|
|
||||||
ldp q24, q25, [Y]
|
|
||||||
|
|
||||||
ldp q18, q19, [X, #32]
|
|
||||||
ldp q26, q27, [Y, #32]
|
|
||||||
|
|
||||||
fmla v0.2d, v16.2d, v24.2d
|
|
||||||
fmla v1.2d, v17.2d, v25.2d
|
|
||||||
|
|
||||||
ldp q20, q21, [X, #64]
|
|
||||||
ldp q28, q29, [Y, #64]
|
|
||||||
|
|
||||||
fmla v2.2d, v18.2d, v26.2d
|
|
||||||
fmla v3.2d, v19.2d, v27.2d
|
|
||||||
|
|
||||||
ldp q22, q23, [X, #96]
|
|
||||||
ldp q30, q31, [Y, #96]
|
|
||||||
|
|
||||||
add Y, Y, #128
|
|
||||||
add X, X, #128
|
|
||||||
|
|
||||||
fmla v4.2d, v20.2d, v28.2d
|
|
||||||
fmla v5.2d, v21.2d, v29.2d
|
|
||||||
|
|
||||||
PRFM PLDL1KEEP, [X, #896]
|
|
||||||
PRFM PLDL1KEEP, [Y, #896]
|
|
||||||
PRFM PLDL1KEEP, [X, #896+64]
|
|
||||||
PRFM PLDL1KEEP, [Y, #896+64]
|
|
||||||
|
|
||||||
fmla v6.2d, v22.2d, v30.2d
|
|
||||||
fmla v7.2d, v23.2d, v31.2d
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F32
|
|
||||||
KERNEL_F16
|
|
||||||
KERNEL_F16
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F32_FINALIZE
|
|
||||||
fadd v0.2d, v0.2d, v1.2d
|
|
||||||
fadd v2.2d, v2.2d, v3.2d
|
|
||||||
fadd v4.2d, v4.2d, v5.2d
|
|
||||||
fadd v6.2d, v6.2d, v7.2d
|
|
||||||
fadd v0.2d, v0.2d, v2.2d
|
|
||||||
fadd v4.2d, v4.2d, v6.2d
|
|
||||||
fadd v0.2d, v0.2d, v4.2d
|
|
||||||
faddp DOTF, v0.2d
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro INIT_S
|
|
||||||
lsl INC_X, INC_X, #3
|
|
||||||
lsl INC_Y, INC_Y, #3
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_S1
|
|
||||||
ld1 LD1VX, [X], INC_X
|
|
||||||
ld1 LD1VY, [Y], INC_Y
|
|
||||||
fmadd DOTF, TMPX, TMPY, DOTF
|
|
||||||
.endm
|
|
||||||
|
|
||||||
/*******************************************************************************
|
|
||||||
* End of macro definitions
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
PROLOGUE
|
|
||||||
|
|
||||||
fmov DOTF, REG0
|
|
||||||
fmov d1, REG0
|
|
||||||
fmov d2, REG0
|
|
||||||
fmov d3, REG0
|
|
||||||
fmov d4, REG0
|
|
||||||
fmov d5, REG0
|
|
||||||
fmov d6, REG0
|
|
||||||
fmov d7, REG0
|
|
||||||
|
|
||||||
cmp N, xzr
|
|
||||||
ble dot_kernel_L999
|
|
||||||
|
|
||||||
cmp INC_X, #1
|
|
||||||
bne dot_kernel_S_BEGIN
|
|
||||||
cmp INC_Y, #1
|
|
||||||
bne dot_kernel_S_BEGIN
|
|
||||||
|
|
||||||
dot_kernel_F_BEGIN:
|
|
||||||
|
|
||||||
asr I, N, #5
|
|
||||||
cmp I, xzr
|
|
||||||
beq dot_kernel_F1
|
|
||||||
|
|
||||||
dot_kernel_F32:
|
|
||||||
|
|
||||||
KERNEL_F32
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne dot_kernel_F32
|
|
||||||
|
|
||||||
KERNEL_F32_FINALIZE
|
|
||||||
|
|
||||||
dot_kernel_F1:
|
|
||||||
|
|
||||||
ands I, N, #31
|
|
||||||
ble dot_kernel_L999
|
|
||||||
|
|
||||||
dot_kernel_F10:
|
|
||||||
|
|
||||||
KERNEL_F1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne dot_kernel_F10
|
|
||||||
|
|
||||||
ret
|
|
||||||
|
|
||||||
dot_kernel_S_BEGIN:
|
|
||||||
|
|
||||||
INIT_S
|
|
||||||
|
|
||||||
asr I, N, #2
|
|
||||||
cmp I, xzr
|
|
||||||
ble dot_kernel_S1
|
|
||||||
|
|
||||||
dot_kernel_S4:
|
|
||||||
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne dot_kernel_S4
|
|
||||||
|
|
||||||
dot_kernel_S1:
|
|
||||||
|
|
||||||
ands I, N, #3
|
|
||||||
ble dot_kernel_L999
|
|
||||||
|
|
||||||
dot_kernel_S10:
|
|
||||||
|
|
||||||
KERNEL_S1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne dot_kernel_S10
|
|
||||||
|
|
||||||
dot_kernel_L999:
|
|
||||||
|
|
||||||
ret
|
|
||||||
|
|
||||||
EPILOGUE
|
|
|
@ -45,9 +45,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define LD1VY "{v24.d}[0]"
|
#define LD1VY "{v24.d}[0]"
|
||||||
#define SZ "8"
|
#define SZ "8"
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static FLOAT ddot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
static FLOAT ddot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
@ -211,6 +213,7 @@ static FLOAT ddot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO
|
||||||
return(dot);
|
return(dot);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
static int ddot_thread_function(BLASLONG n, BLASLONG dummy0,
|
static int ddot_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
|
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
|
||||||
|
@ -219,13 +222,17 @@ static int ddot_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
{
|
{
|
||||||
|
#if defined(SMP)
|
||||||
int nthreads;
|
int nthreads;
|
||||||
FLOAT dot = 0.0;
|
|
||||||
FLOAT dummy_alpha;
|
FLOAT dummy_alpha;
|
||||||
|
#endif
|
||||||
|
FLOAT dot = 0.0;
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
if (inc_x == 0 || inc_y == 0)
|
||||||
|
@ -253,6 +260,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
|
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
dot = ddot_compute(n, x, inc_x, y, inc_y);
|
||||||
|
#endif
|
||||||
|
|
||||||
return dot;
|
return dot;
|
||||||
}
|
}
|
|
@ -1,228 +0,0 @@
|
||||||
/*******************************************************************************
|
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
|
||||||
All rights reserved.
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
1. Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in
|
|
||||||
the documentation and/or other materials provided with the
|
|
||||||
distribution.
|
|
||||||
3. Neither the name of the OpenBLAS project nor the names of
|
|
||||||
its contributors may be used to endorse or promote products
|
|
||||||
derived from this software without specific prior written permission.
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
||||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
#define ASSEMBLER
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#define N x0 /* vector length */
|
|
||||||
#define X x1 /* X vector address */
|
|
||||||
#define INC_X x2 /* X stride */
|
|
||||||
#define I x5 /* loop variable */
|
|
||||||
|
|
||||||
/*******************************************************************************
|
|
||||||
* Macro definitions
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
#define TMPF s16
|
|
||||||
#define TMPFD d17
|
|
||||||
#define SSQ s0
|
|
||||||
#define SSQD d0
|
|
||||||
#define TMPVF {v16.s}[0]
|
|
||||||
#define TMPVFD {v17.s}[0]
|
|
||||||
#define SZ 4
|
|
||||||
|
|
||||||
/******************************************************************************/
|
|
||||||
|
|
||||||
.macro INIT
|
|
||||||
fmov SSQD, xzr
|
|
||||||
fmov d1, xzr
|
|
||||||
fmov d2, xzr
|
|
||||||
fmov d3, xzr
|
|
||||||
fmov d4, xzr
|
|
||||||
fmov d5, xzr
|
|
||||||
fmov d6, xzr
|
|
||||||
fmov d7, xzr
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F1
|
|
||||||
ldr TMPF, [X], #SZ
|
|
||||||
fcvt TMPFD, TMPF
|
|
||||||
fmadd SSQD, TMPFD, TMPFD, SSQD
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F32
|
|
||||||
ldur q16, [X]
|
|
||||||
ldur q18, [X, #16]
|
|
||||||
ldur q20, [X, #32]
|
|
||||||
ldur q22, [X, #48]
|
|
||||||
ldur q24, [X, #64]
|
|
||||||
ldur q26, [X, #80]
|
|
||||||
ldur q28, [X, #96]
|
|
||||||
ldur q30, [X, #112]
|
|
||||||
|
|
||||||
add X, X, #128
|
|
||||||
|
|
||||||
fcvtl2 v17.2d, v16.4s
|
|
||||||
fcvtl v16.2d, v16.2s
|
|
||||||
fcvtl2 v19.2d, v18.4s
|
|
||||||
fcvtl v18.2d, v18.2s
|
|
||||||
fcvtl2 v21.2d, v20.4s
|
|
||||||
fcvtl v20.2d, v20.2s
|
|
||||||
fcvtl2 v23.2d, v22.4s
|
|
||||||
fcvtl v22.2d, v22.2s
|
|
||||||
fcvtl2 v25.2d, v24.4s
|
|
||||||
fcvtl v24.2d, v24.2s
|
|
||||||
fcvtl2 v27.2d, v26.4s
|
|
||||||
fcvtl v26.2d, v26.2s
|
|
||||||
fcvtl2 v29.2d, v28.4s
|
|
||||||
fcvtl v28.2d, v28.2s
|
|
||||||
fcvtl2 v31.2d, v30.4s
|
|
||||||
fcvtl v30.2d, v30.2s
|
|
||||||
|
|
||||||
fmla v0.2d, v16.2d, v16.2d
|
|
||||||
fmla v1.2d, v17.2d, v17.2d
|
|
||||||
fmla v2.2d, v18.2d, v18.2d
|
|
||||||
fmla v3.2d, v19.2d, v19.2d
|
|
||||||
fmla v4.2d, v20.2d, v20.2d
|
|
||||||
fmla v5.2d, v21.2d, v21.2d
|
|
||||||
fmla v6.2d, v22.2d, v22.2d
|
|
||||||
fmla v7.2d, v23.2d, v23.2d
|
|
||||||
|
|
||||||
fmla v0.2d, v24.2d, v24.2d
|
|
||||||
fmla v1.2d, v25.2d, v25.2d
|
|
||||||
fmla v2.2d, v26.2d, v26.2d
|
|
||||||
fmla v3.2d, v27.2d, v27.2d
|
|
||||||
fmla v4.2d, v28.2d, v28.2d
|
|
||||||
fmla v5.2d, v29.2d, v29.2d
|
|
||||||
fmla v6.2d, v30.2d, v30.2d
|
|
||||||
fmla v7.2d, v31.2d, v31.2d
|
|
||||||
|
|
||||||
prfm PLDL1KEEP, [X, #1024]
|
|
||||||
prfm PLDL1KEEP, [X, #1024+64]
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_F32_FINALIZE
|
|
||||||
fadd v0.2d, v0.2d, v1.2d
|
|
||||||
fadd v2.2d, v2.2d, v3.2d
|
|
||||||
fadd v4.2d, v4.2d, v5.2d
|
|
||||||
fadd v6.2d, v6.2d, v7.2d
|
|
||||||
|
|
||||||
fadd v0.2d, v0.2d, v2.2d
|
|
||||||
fadd v4.2d, v4.2d, v6.2d
|
|
||||||
|
|
||||||
fadd v0.2d, v0.2d, v4.2d
|
|
||||||
faddp SSQD, v0.2d
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro INIT_S
|
|
||||||
lsl INC_X, INC_X, #2
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL_S1
|
|
||||||
ldr TMPF, [X]
|
|
||||||
add X, X, INC_X
|
|
||||||
fcvt TMPFD, TMPF
|
|
||||||
fmadd SSQD, TMPFD, TMPFD, SSQD
|
|
||||||
.endm
|
|
||||||
|
|
||||||
/*******************************************************************************
|
|
||||||
* End of macro definitions
|
|
||||||
*******************************************************************************/
|
|
||||||
|
|
||||||
PROLOGUE
|
|
||||||
|
|
||||||
INIT
|
|
||||||
|
|
||||||
cmp N, xzr
|
|
||||||
ble nrm2_kernel_zero
|
|
||||||
cmp INC_X, xzr
|
|
||||||
ble nrm2_kernel_zero
|
|
||||||
cmp INC_X, #1
|
|
||||||
bne nrm2_kernel_S_BEGIN
|
|
||||||
|
|
||||||
nrm2_kernel_F_BEGIN:
|
|
||||||
|
|
||||||
asr I, N, #6
|
|
||||||
cmp I, xzr
|
|
||||||
beq nrm2_kernel_S_BEGIN
|
|
||||||
|
|
||||||
.align 5
|
|
||||||
nrm2_kernel_F64:
|
|
||||||
|
|
||||||
KERNEL_F32
|
|
||||||
KERNEL_F32
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne nrm2_kernel_F64
|
|
||||||
|
|
||||||
KERNEL_F32_FINALIZE
|
|
||||||
|
|
||||||
nrm2_kernel_F1:
|
|
||||||
|
|
||||||
ands I, N, #63
|
|
||||||
ble nrm2_kernel_L999
|
|
||||||
|
|
||||||
nrm2_kernel_F10:
|
|
||||||
|
|
||||||
KERNEL_F1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne nrm2_kernel_F10
|
|
||||||
|
|
||||||
b nrm2_kernel_L999
|
|
||||||
|
|
||||||
nrm2_kernel_S_BEGIN:
|
|
||||||
|
|
||||||
INIT_S
|
|
||||||
|
|
||||||
asr I, N, #2
|
|
||||||
cmp I, xzr
|
|
||||||
ble nrm2_kernel_S1
|
|
||||||
|
|
||||||
nrm2_kernel_S4:
|
|
||||||
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
KERNEL_S1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne nrm2_kernel_S4
|
|
||||||
|
|
||||||
nrm2_kernel_S1:
|
|
||||||
|
|
||||||
ands I, N, #3
|
|
||||||
ble nrm2_kernel_L999
|
|
||||||
|
|
||||||
nrm2_kernel_S10:
|
|
||||||
|
|
||||||
KERNEL_S1
|
|
||||||
|
|
||||||
subs I, I, #1
|
|
||||||
bne nrm2_kernel_S10
|
|
||||||
|
|
||||||
nrm2_kernel_L999:
|
|
||||||
fsqrt SSQD, SSQD
|
|
||||||
fcvt SSQ, SSQD
|
|
||||||
ret
|
|
||||||
|
|
||||||
nrm2_kernel_zero:
|
|
||||||
fmov SSQ, wzr
|
|
||||||
|
|
||||||
ret
|
|
||||||
|
|
||||||
EPILOGUE
|
|
|
@ -30,11 +30,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !defined(DOUBLE)
|
|
||||||
#define N "x0" /* vector length */
|
#define N "x0" /* vector length */
|
||||||
#define X "x1" /* X vector address */
|
#define X "x1" /* X vector address */
|
||||||
#define INC_X "x2" /* X stride */
|
#define INC_X "x2" /* X stride */
|
||||||
|
@ -197,9 +198,8 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#else //!defined(DOUBLE)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
|
static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
|
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
|
||||||
BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
|
BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
|
||||||
|
@ -208,17 +208,21 @@ static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
{
|
{
|
||||||
|
#if defined(SMP)
|
||||||
int nthreads;
|
int nthreads;
|
||||||
|
FLOAT dummy_alpha;
|
||||||
|
#endif
|
||||||
FLOAT nrm2 = 0.0;
|
FLOAT nrm2 = 0.0;
|
||||||
double nrm2_double = 0.0;
|
double nrm2_double = 0.0;
|
||||||
FLOAT dummy_alpha;
|
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
if (n == 1) return fabs(x[0]);
|
if (n == 1) return fabs(x[0]);
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
|
@ -243,6 +247,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
|
ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
nrm2_double = nrm2_compute(n, x, inc_x);
|
||||||
|
#endif
|
||||||
nrm2 = sqrt(nrm2_double);
|
nrm2 = sqrt(nrm2_double);
|
||||||
|
|
||||||
return nrm2;
|
return nrm2;
|
Loading…
Reference in New Issue