THUNDERX2T99: Add parallel SCNRM2 Implementation
This commit is contained in:
parent
8e89668f62
commit
228c75a69c
|
@ -19,8 +19,8 @@ ISAMAXKERNEL = iamax_thunderx2t99.c
|
|||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
|
||||
|
||||
SNRM2KERNEL = snrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = cnrm2_thunderx2t99.S
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
|
||||
|
|
|
@ -36,6 +36,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#define N "x0" /* vector length */
|
||||
#define X "x1" /* X vector address */
|
||||
#define INC_X "x2" /* X stride */
|
||||
|
@ -45,11 +46,19 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
#define TMPFD "d17" /* destination of the fcvt from TMPF in KERNEL_F1; the value that gets squared */
|
||||
#define SSQD "d0" /* running sum of squares; moved into the return operand at .Lnrm2_kernel_L999 */
|
||||
|
||||
#define N_DIV_SHIFT "6" /* asr shift turning N into the KERNEL_F loop count (N / 64) */
|
||||
#define N_REM_MASK "63" /* mask applied to N at .Lnrm2_kernel_F1 to get the count left over after the KERNEL_F loop (N % 64) */
|
||||
#define INC_SHIFT "2" /* lsl shift applied to INC_X at .Lnrm2_kernel_S_BEGIN (stride * 4, matching the 4-byte step in KERNEL_F1) */
|
||||
|
||||
/*
 * Accumulate a single element: load TMPF from X (post-incrementing X by
 * 4 bytes), convert it into TMPFD, then SSQD = TMPFD * TMPFD + SSQD.
 */
#define KERNEL_F1 \
|
||||
"ldr "TMPF", ["X"], #4 \n" \
|
||||
"fcvt "TMPFD", "TMPF" \n" \
|
||||
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
||||
|
||||
/*
 * One iteration of the main loop: two back-to-back KERNEL_F32 expansions.
 * KERNEL_F32 is defined further down; that is fine because the macro is
 * only expanded where KERNEL_F is used.
 */
#define KERNEL_F \
|
||||
KERNEL_F32 \
|
||||
KERNEL_F32
|
||||
|
||||
#define KERNEL_F32 \
|
||||
"ldur q16, ["X"] \n" \
|
||||
"ldur q18, ["X", #16] \n" \
|
||||
|
@ -95,7 +104,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
"prfm PLDL1KEEP, ["X", #1024] \n" \
|
||||
"prfm PLDL1KEEP, ["X", #1024+64] \n"
|
||||
|
||||
#define KERNEL_F32_FINALIZE \
|
||||
#define KERNEL_F_FINALIZE \
|
||||
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||
|
@ -111,6 +120,93 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
"fcvt "TMPFD", "TMPF" \n" \
|
||||
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
|
||||
|
||||
/*
 * Expanded at .Lnrm2_kernel_L999 just before SSQD is moved to the return
 * operand. In this (!COMPLEX) branch it is an empty string, so no extra
 * instruction is emitted there.
 */
#define KERNEL_FINALIZE \
|
||||
""
|
||||
|
||||
#else
|
||||
|
||||
#define N "x0" /* vector length */
|
||||
#define X "x1" /* X vector address */
|
||||
#define INC_X "x2" /* X stride */
|
||||
#define I "x5" /* loop variable */
|
||||
|
||||
#define TMPF "d16" /* load target in KERNEL_F1/KERNEL_S1; the following fcvtl hard-codes v16, so keep the register number in sync */
|
||||
#define SSQD "d0" /* scalar result register; KERNEL_FINALIZE folds v0.2d into it */
|
||||
|
||||
#define N_DIV_SHIFT "4" /* asr shift turning N into the KERNEL_F loop count (N / 16) */
|
||||
#define N_REM_MASK "15" /* mask applied to N at .Lnrm2_kernel_F1 to get the count left over after the KERNEL_F loop (N % 16) */
|
||||
#define INC_SHIFT "3" /* lsl shift applied to INC_X at .Lnrm2_kernel_S_BEGIN (stride * 8, the bytes KERNEL_S1 loads per element) */
|
||||
|
||||
/*
 * Accumulate a single element with a fixed 8-byte step: load 8 bytes
 * (two single-precision values) into TMPF, advance X by 8, widen both
 * lanes to double, and add their squares into the two lanes of v0.
 * The fcvtl/fmla operands name v16 directly, which is the vector view
 * of TMPF ("d16").
 */
#define KERNEL_F1 \
|
||||
"ldr "TMPF", ["X"] \n" \
|
||||
"add "X", "X", #8 \n" \
|
||||
"fcvtl v16.2d, v16.2s \n" \
|
||||
"fmla v0.2d, v16.2d, v16.2d \n"
|
||||
|
||||
/*
 * One iteration of the main loop, consuming 128 bytes from X:
 *   1. Load eight q registers (q16, q18, ..., q30) at offsets 0..112 and
 *      advance X by 128.
 *   2. Widen each loaded register from 4 x single to 2 x (2 x double):
 *      fcvtl2 puts the upper half into the next odd register, then fcvtl
 *      converts the lower half in place. fcvtl2 must come first because
 *      fcvtl overwrites its source register.
 *   3. Accumulate the squares into v0..v7; each accumulator is used twice
 *      (v16..v23, then v24..v31).
 *   4. Issue two prefetch hints at X + 1024 and X + 1024 + 64.
 * The partial sums in v0..v7 are combined later by KERNEL_F_FINALIZE.
 */
#define KERNEL_F \
|
||||
"ldur q16, ["X"] \n" \
|
||||
"ldur q18, ["X", #16] \n" \
|
||||
"ldur q20, ["X", #32] \n" \
|
||||
"ldur q22, ["X", #48] \n" \
|
||||
"ldur q24, ["X", #64] \n" \
|
||||
"ldur q26, ["X", #80] \n" \
|
||||
"ldur q28, ["X", #96] \n" \
|
||||
"ldur q30, ["X", #112] \n" \
|
||||
"add "X", "X", #128 \n" \
|
||||
"fcvtl2 v17.2d, v16.4s \n" \
|
||||
"fcvtl v16.2d, v16.2s \n" \
|
||||
"fcvtl2 v19.2d, v18.4s \n" \
|
||||
"fcvtl v18.2d, v18.2s \n" \
|
||||
"fcvtl2 v21.2d, v20.4s \n" \
|
||||
"fcvtl v20.2d, v20.2s \n" \
|
||||
"fcvtl2 v23.2d, v22.4s \n" \
|
||||
"fcvtl v22.2d, v22.2s \n" \
|
||||
"fcvtl2 v25.2d, v24.4s \n" \
|
||||
"fcvtl v24.2d, v24.2s \n" \
|
||||
"fcvtl2 v27.2d, v26.4s \n" \
|
||||
"fcvtl v26.2d, v26.2s \n" \
|
||||
"fcvtl2 v29.2d, v28.4s \n" \
|
||||
"fcvtl v28.2d, v28.2s \n" \
|
||||
"fcvtl2 v31.2d, v30.4s \n" \
|
||||
"fcvtl v30.2d, v30.2s \n" \
|
||||
"fmla v0.2d, v16.2d, v16.2d \n" \
|
||||
"fmla v1.2d, v17.2d, v17.2d \n" \
|
||||
"fmla v2.2d, v18.2d, v18.2d \n" \
|
||||
"fmla v3.2d, v19.2d, v19.2d \n" \
|
||||
"fmla v4.2d, v20.2d, v20.2d \n" \
|
||||
"fmla v5.2d, v21.2d, v21.2d \n" \
|
||||
"fmla v6.2d, v22.2d, v22.2d \n" \
|
||||
"fmla v7.2d, v23.2d, v23.2d \n" \
|
||||
"fmla v0.2d, v24.2d, v24.2d \n" \
|
||||
"fmla v1.2d, v25.2d, v25.2d \n" \
|
||||
"fmla v2.2d, v26.2d, v26.2d \n" \
|
||||
"fmla v3.2d, v27.2d, v27.2d \n" \
|
||||
"fmla v4.2d, v28.2d, v28.2d \n" \
|
||||
"fmla v5.2d, v29.2d, v29.2d \n" \
|
||||
"fmla v6.2d, v30.2d, v30.2d \n" \
|
||||
"fmla v7.2d, v31.2d, v31.2d \n" \
|
||||
"prfm PLDL1KEEP, ["X", #1024] \n" \
|
||||
"prfm PLDL1KEEP, ["X", #1024+64] \n"
|
||||
|
||||
/*
 * Emitted once after the KERNEL_F loop: a pairwise add tree that reduces
 * the eight partial-sum registers v0..v7 into v0.2d
 * (8 -> 4 -> 2 -> 1 registers).
 */
#define KERNEL_F_FINALIZE \
|
||||
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||
"fadd v6.2d, v6.2d, v7.2d \n" \
|
||||
"fadd v0.2d, v0.2d, v2.2d \n" \
|
||||
"fadd v4.2d, v4.2d, v6.2d \n" \
|
||||
"fadd v0.2d, v0.2d, v4.2d \n"
|
||||
|
||||
/*
 * Expanded at .Lnrm2_kernel_L999 just before SSQD is moved to the return
 * operand: adds the two lanes of v0.2d together into the scalar SSQD.
 */
#define KERNEL_FINALIZE \
|
||||
"faddp "SSQD", v0.2d \n"
|
||||
|
||||
/*
 * Accumulate a single element in the strided path: same load / widen /
 * square-accumulate sequence as KERNEL_F1, but X advances by the INC_X
 * register (already shifted left by INC_SHIFT at .Lnrm2_kernel_S_BEGIN)
 * instead of a fixed 8 bytes.
 */
#define KERNEL_S1 \
|
||||
"ldr "TMPF", ["X"] \n" \
|
||||
"add "X", "X", "INC_X" \n" \
|
||||
"fcvtl v16.2d, v16.2s \n" \
|
||||
"fmla v0.2d, v16.2d, v16.2d \n"
|
||||
#endif
|
||||
|
||||
|
||||
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
|
@ -138,20 +234,19 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
" bne .Lnrm2_kernel_S_BEGIN \n"
|
||||
|
||||
".Lnrm2_kernel_F_BEGIN: \n"
|
||||
" asr "I", "N", #6 \n"
|
||||
" asr "I", "N", #"N_DIV_SHIFT" \n"
|
||||
" cmp "I", xzr \n"
|
||||
" beq .Lnrm2_kernel_S_BEGIN \n"
|
||||
|
||||
" .align 5 \n"
|
||||
".Lnrm2_kernel_F64: \n"
|
||||
" "KERNEL_F32" \n"
|
||||
" "KERNEL_F32" \n"
|
||||
".Lnrm2_kernel_F: \n"
|
||||
" "KERNEL_F" \n"
|
||||
" subs "I", "I", #1 \n"
|
||||
" bne .Lnrm2_kernel_F64 \n"
|
||||
" "KERNEL_F32_FINALIZE" \n"
|
||||
" bne .Lnrm2_kernel_F \n"
|
||||
" "KERNEL_F_FINALIZE" \n"
|
||||
|
||||
".Lnrm2_kernel_F1: \n"
|
||||
" ands "I", "N", #63 \n"
|
||||
" ands "I", "N", #"N_REM_MASK" \n"
|
||||
" ble .Lnrm2_kernel_L999 \n"
|
||||
|
||||
".Lnrm2_kernel_F10: \n"
|
||||
|
@ -161,7 +256,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
" b .Lnrm2_kernel_L999 \n"
|
||||
|
||||
".Lnrm2_kernel_S_BEGIN: \n"
|
||||
" lsl "INC_X", "INC_X", #2 \n"
|
||||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||
" asr "I", "N", #2 \n"
|
||||
" cmp "I", xzr \n"
|
||||
" ble .Lnrm2_kernel_S1 \n"
|
||||
|
@ -184,6 +279,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
" bne .Lnrm2_kernel_S10 \n"
|
||||
|
||||
".Lnrm2_kernel_L999: \n"
|
||||
" "KERNEL_FINALIZE" \n"
|
||||
" fmov %[RET_], "SSQD" \n"
|
||||
|
||||
: [RET_] "=r" (ret) //%0
|
||||
|
@ -214,13 +310,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT dummy_alpha;
|
||||
FLOAT dummy_alpha[2];
|
||||
#endif
|
||||
FLOAT nrm2 = 0.0;
|
||||
double nrm2_double = 0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||
if (n == 1) return fabs(x[0]);
|
||||
|
||||
#if defined(SMP)
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
@ -235,7 +330,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||
double *ptr;
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, NULL, 0, result, 0,
|
Loading…
Reference in New Issue