THUNDERX2T99: Add parallel SCNRM2 Implementation
This commit is contained in:
parent
8e89668f62
commit
228c75a69c
|
@ -19,8 +19,8 @@ ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
|
||||||
|
|
||||||
SNRM2KERNEL = snrm2_thunderx2t99.c
|
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
CNRM2KERNEL = cnrm2_thunderx2t99.S
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
|
||||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
|
|
||||||
|
|
|
@ -36,6 +36,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if !defined(COMPLEX)
|
||||||
#define N "x0" /* vector length */
|
#define N "x0" /* vector length */
|
||||||
#define X "x1" /* X vector address */
|
#define X "x1" /* X vector address */
|
||||||
#define INC_X "x2" /* X stride */
|
#define INC_X "x2" /* X stride */
|
||||||
|
@ -45,11 +46,19 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||||
#define TMPFD "d17"
|
#define TMPFD "d17"
|
||||||
#define SSQD "d0"
|
#define SSQD "d0"
|
||||||
|
|
||||||
|
#define N_DIV_SHIFT "6"
|
||||||
|
#define N_REM_MASK "63"
|
||||||
|
#define INC_SHIFT "2"
|
||||||
|
|
||||||
#define KERNEL_F1 \
|
#define KERNEL_F1 \
|
||||||
"ldr "TMPF", ["X"], #4 \n" \
|
"ldr "TMPF", ["X"], #4 \n" \
|
||||||
"fcvt "TMPFD", "TMPF" \n" \
|
"fcvt "TMPFD", "TMPF" \n" \
|
||||||
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
||||||
|
|
||||||
|
#define KERNEL_F \
|
||||||
|
KERNEL_F32 \
|
||||||
|
KERNEL_F32
|
||||||
|
|
||||||
#define KERNEL_F32 \
|
#define KERNEL_F32 \
|
||||||
"ldur q16, ["X"] \n" \
|
"ldur q16, ["X"] \n" \
|
||||||
"ldur q18, ["X", #16] \n" \
|
"ldur q18, ["X", #16] \n" \
|
||||||
|
@ -95,7 +104,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||||
"prfm PLDL1KEEP, ["X", #1024] \n" \
|
"prfm PLDL1KEEP, ["X", #1024] \n" \
|
||||||
"prfm PLDL1KEEP, ["X", #1024+64] \n"
|
"prfm PLDL1KEEP, ["X", #1024+64] \n"
|
||||||
|
|
||||||
#define KERNEL_F32_FINALIZE \
|
#define KERNEL_F_FINALIZE \
|
||||||
"fadd v0.2d, v0.2d, v1.2d \n" \
|
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||||
"fadd v2.2d, v2.2d, v3.2d \n" \
|
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||||
"fadd v4.2d, v4.2d, v5.2d \n" \
|
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||||
|
@ -111,6 +120,93 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||||
"fcvt "TMPFD", "TMPF" \n" \
|
"fcvt "TMPFD", "TMPF" \n" \
|
||||||
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
||||||
|
|
||||||
|
#define KERNEL_FINALIZE \
|
||||||
|
""
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define N "x0" /* vector length */
|
||||||
|
#define X "x1" /* X vector address */
|
||||||
|
#define INC_X "x2" /* X stride */
|
||||||
|
#define I "x5" /* loop variable */
|
||||||
|
|
||||||
|
#define TMPF "d16"
|
||||||
|
#define SSQD "d0"
|
||||||
|
|
||||||
|
#define N_DIV_SHIFT "4"
|
||||||
|
#define N_REM_MASK "15"
|
||||||
|
#define INC_SHIFT "3"
|
||||||
|
|
||||||
|
#define KERNEL_F1 \
|
||||||
|
"ldr "TMPF", ["X"] \n" \
|
||||||
|
"add "X", "X", #8 \n" \
|
||||||
|
"fcvtl v16.2d, v16.2s \n" \
|
||||||
|
"fmla v0.2d, v16.2d, v16.2d \n"
|
||||||
|
|
||||||
|
#define KERNEL_F \
|
||||||
|
"ldur q16, ["X"] \n" \
|
||||||
|
"ldur q18, ["X", #16] \n" \
|
||||||
|
"ldur q20, ["X", #32] \n" \
|
||||||
|
"ldur q22, ["X", #48] \n" \
|
||||||
|
"ldur q24, ["X", #64] \n" \
|
||||||
|
"ldur q26, ["X", #80] \n" \
|
||||||
|
"ldur q28, ["X", #96] \n" \
|
||||||
|
"ldur q30, ["X", #112] \n" \
|
||||||
|
"add "X", "X", #128 \n" \
|
||||||
|
"fcvtl2 v17.2d, v16.4s \n" \
|
||||||
|
"fcvtl v16.2d, v16.2s \n" \
|
||||||
|
"fcvtl2 v19.2d, v18.4s \n" \
|
||||||
|
"fcvtl v18.2d, v18.2s \n" \
|
||||||
|
"fcvtl2 v21.2d, v20.4s \n" \
|
||||||
|
"fcvtl v20.2d, v20.2s \n" \
|
||||||
|
"fcvtl2 v23.2d, v22.4s \n" \
|
||||||
|
"fcvtl v22.2d, v22.2s \n" \
|
||||||
|
"fcvtl2 v25.2d, v24.4s \n" \
|
||||||
|
"fcvtl v24.2d, v24.2s \n" \
|
||||||
|
"fcvtl2 v27.2d, v26.4s \n" \
|
||||||
|
"fcvtl v26.2d, v26.2s \n" \
|
||||||
|
"fcvtl2 v29.2d, v28.4s \n" \
|
||||||
|
"fcvtl v28.2d, v28.2s \n" \
|
||||||
|
"fcvtl2 v31.2d, v30.4s \n" \
|
||||||
|
"fcvtl v30.2d, v30.2s \n" \
|
||||||
|
"fmla v0.2d, v16.2d, v16.2d \n" \
|
||||||
|
"fmla v1.2d, v17.2d, v17.2d \n" \
|
||||||
|
"fmla v2.2d, v18.2d, v18.2d \n" \
|
||||||
|
"fmla v3.2d, v19.2d, v19.2d \n" \
|
||||||
|
"fmla v4.2d, v20.2d, v20.2d \n" \
|
||||||
|
"fmla v5.2d, v21.2d, v21.2d \n" \
|
||||||
|
"fmla v6.2d, v22.2d, v22.2d \n" \
|
||||||
|
"fmla v7.2d, v23.2d, v23.2d \n" \
|
||||||
|
"fmla v0.2d, v24.2d, v24.2d \n" \
|
||||||
|
"fmla v1.2d, v25.2d, v25.2d \n" \
|
||||||
|
"fmla v2.2d, v26.2d, v26.2d \n" \
|
||||||
|
"fmla v3.2d, v27.2d, v27.2d \n" \
|
||||||
|
"fmla v4.2d, v28.2d, v28.2d \n" \
|
||||||
|
"fmla v5.2d, v29.2d, v29.2d \n" \
|
||||||
|
"fmla v6.2d, v30.2d, v30.2d \n" \
|
||||||
|
"fmla v7.2d, v31.2d, v31.2d \n" \
|
||||||
|
"prfm PLDL1KEEP, ["X", #1024] \n" \
|
||||||
|
"prfm PLDL1KEEP, ["X", #1024+64] \n"
|
||||||
|
|
||||||
|
#define KERNEL_F_FINALIZE \
|
||||||
|
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||||
|
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||||
|
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||||
|
"fadd v6.2d, v6.2d, v7.2d \n" \
|
||||||
|
"fadd v0.2d, v0.2d, v2.2d \n" \
|
||||||
|
"fadd v4.2d, v4.2d, v6.2d \n" \
|
||||||
|
"fadd v0.2d, v0.2d, v4.2d \n"
|
||||||
|
|
||||||
|
#define KERNEL_FINALIZE \
|
||||||
|
"faddp "SSQD", v0.2d \n"
|
||||||
|
|
||||||
|
#define KERNEL_S1 \
|
||||||
|
"ldr "TMPF", ["X"] \n" \
|
||||||
|
"add "X", "X", "INC_X" \n" \
|
||||||
|
"fcvtl v16.2d, v16.2s \n" \
|
||||||
|
"fmla v0.2d, v16.2d, v16.2d \n"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
{
|
{
|
||||||
|
@ -138,20 +234,19 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" bne .Lnrm2_kernel_S_BEGIN \n"
|
" bne .Lnrm2_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F_BEGIN: \n"
|
".Lnrm2_kernel_F_BEGIN: \n"
|
||||||
" asr "I", "N", #6 \n"
|
" asr "I", "N", #"N_DIV_SHIFT" \n"
|
||||||
" cmp "I", xzr \n"
|
" cmp "I", xzr \n"
|
||||||
" beq .Lnrm2_kernel_S_BEGIN \n"
|
" beq .Lnrm2_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
" .align 5 \n"
|
" .align 5 \n"
|
||||||
".Lnrm2_kernel_F64: \n"
|
".Lnrm2_kernel_F: \n"
|
||||||
" "KERNEL_F32" \n"
|
" "KERNEL_F" \n"
|
||||||
" "KERNEL_F32" \n"
|
|
||||||
" subs "I", "I", #1 \n"
|
" subs "I", "I", #1 \n"
|
||||||
" bne .Lnrm2_kernel_F64 \n"
|
" bne .Lnrm2_kernel_F \n"
|
||||||
" "KERNEL_F32_FINALIZE" \n"
|
" "KERNEL_F_FINALIZE" \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F1: \n"
|
".Lnrm2_kernel_F1: \n"
|
||||||
" ands "I", "N", #63 \n"
|
" ands "I", "N", #"N_REM_MASK" \n"
|
||||||
" ble .Lnrm2_kernel_L999 \n"
|
" ble .Lnrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_F10: \n"
|
".Lnrm2_kernel_F10: \n"
|
||||||
|
@ -161,7 +256,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" b .Lnrm2_kernel_L999 \n"
|
" b .Lnrm2_kernel_L999 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_S_BEGIN: \n"
|
".Lnrm2_kernel_S_BEGIN: \n"
|
||||||
" lsl "INC_X", "INC_X", #2 \n"
|
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||||
" asr "I", "N", #2 \n"
|
" asr "I", "N", #2 \n"
|
||||||
" cmp "I", xzr \n"
|
" cmp "I", xzr \n"
|
||||||
" ble .Lnrm2_kernel_S1 \n"
|
" ble .Lnrm2_kernel_S1 \n"
|
||||||
|
@ -184,6 +279,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" bne .Lnrm2_kernel_S10 \n"
|
" bne .Lnrm2_kernel_S10 \n"
|
||||||
|
|
||||||
".Lnrm2_kernel_L999: \n"
|
".Lnrm2_kernel_L999: \n"
|
||||||
|
" "KERNEL_FINALIZE" \n"
|
||||||
" fmov %[RET_], "SSQD" \n"
|
" fmov %[RET_], "SSQD" \n"
|
||||||
|
|
||||||
: [RET_] "=r" (ret) //%0
|
: [RET_] "=r" (ret) //%0
|
||||||
|
@ -214,13 +310,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
{
|
{
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
int nthreads;
|
int nthreads;
|
||||||
FLOAT dummy_alpha;
|
FLOAT dummy_alpha[2];
|
||||||
#endif
|
#endif
|
||||||
FLOAT nrm2 = 0.0;
|
FLOAT nrm2 = 0.0;
|
||||||
double nrm2_double = 0.0;
|
double nrm2_double = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
if (n == 1) return fabs(x[0]);
|
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
nthreads = num_cpu_avail(1);
|
||||||
|
@ -235,7 +330,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||||
double *ptr;
|
double *ptr;
|
||||||
|
|
||||||
|
#if !defined(COMPLEX)
|
||||||
mode = BLAS_SINGLE | BLAS_REAL;
|
mode = BLAS_SINGLE | BLAS_REAL;
|
||||||
|
#else
|
||||||
|
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||||
|
#endif
|
||||||
|
|
||||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||||
x, inc_x, NULL, 0, result, 0,
|
x, inc_x, NULL, 0, result, 0,
|
Loading…
Reference in New Issue