THUNDERX2T99: Add parallel SCNRM2 Implementation

This commit is contained in:
Ashwin Sekhar T K 2017-02-14 04:10:06 -08:00
parent 8e89668f62
commit 228c75a69c
2 changed files with 119 additions and 20 deletions

View File

@ -19,8 +19,8 @@ ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c IDAMAXKERNEL = iamax_thunderx2t99.c
SNRM2KERNEL = snrm2_thunderx2t99.c SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = cnrm2_thunderx2t99.S CNRM2KERNEL = scnrm2_thunderx2t99.c
DAXPYKERNEL = daxpy_thunderx2t99.S DAXPYKERNEL = daxpy_thunderx2t99.S

View File

@ -36,20 +36,29 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
void *c, BLASLONG ldc, int (*function)(), int nthreads); void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif #endif
#define N "x0" /* vector length */ #if !defined(COMPLEX)
#define X "x1" /* X vector address */ #define N "x0" /* vector length */
#define INC_X "x2" /* X stride */ #define X "x1" /* X vector address */
#define I "x5" /* loop variable */ #define INC_X "x2" /* X stride */
#define I "x5" /* loop variable */
#define TMPF "s16" #define TMPF "s16"
#define TMPFD "d17" #define TMPFD "d17"
#define SSQD "d0" #define SSQD "d0"
#define N_DIV_SHIFT "6"
#define N_REM_MASK "63"
#define INC_SHIFT "2"
#define KERNEL_F1 \ #define KERNEL_F1 \
"ldr "TMPF", ["X"], #4 \n" \ "ldr "TMPF", ["X"], #4 \n" \
"fcvt "TMPFD", "TMPF" \n" \ "fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
#define KERNEL_F \
KERNEL_F32 \
KERNEL_F32
#define KERNEL_F32 \ #define KERNEL_F32 \
"ldur q16, ["X"] \n" \ "ldur q16, ["X"] \n" \
"ldur q18, ["X", #16] \n" \ "ldur q18, ["X", #16] \n" \
@ -95,7 +104,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
"prfm PLDL1KEEP, ["X", #1024] \n" \ "prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n" "prfm PLDL1KEEP, ["X", #1024+64] \n"
#define KERNEL_F32_FINALIZE \ #define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \
@ -111,6 +120,93 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
"fcvt "TMPFD", "TMPF" \n" \ "fcvt "TMPFD", "TMPF" \n" \
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n"
#define KERNEL_FINALIZE \
""
#else
/*
 * Register names and loop constants for the COMPLEX build (this is the
 * #else arm of "#if !defined(COMPLEX)"). Everything is a string literal
 * so it can be pasted straight into the inline-assembly text.
 */
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define I "x5" /* loop variable */
/* 64-bit scratch register loaded by KERNEL_F1 / KERNEL_S1; d16 is the low
 * half of v16, which those kernels then widen in place. */
#define TMPF "d16"
/* Scalar register that KERNEL_FINALIZE writes the sum of squares into. */
#define SSQD "d0"
/* Shift applied to N to get the KERNEL_F trip count: KERNEL_F advances X by
 * 128 bytes and KERNEL_F1 by 8, so one KERNEL_F pass covers 16 (1 << 4)
 * single-element steps. */
#define N_DIV_SHIFT "4"
/* Mask applied to N to get the count left over after the KERNEL_F loop
 * (N & 15). */
#define N_REM_MASK "15"
/* Left shift applied to INC_X before the strided loop, i.e. a multiply by 8
 * (presumably turning an element stride into a byte stride - confirm). */
#define INC_SHIFT "3"
/*
 * KERNEL_F1: contiguous single-step kernel.
 * Loads 8 bytes at X into d16 (two single-precision values, presumably the
 * real and imaginary parts of one complex element), advances X by 8 bytes,
 * widens both values to double precision in v16.2d, and adds their squares
 * lane-wise into the accumulator v0.2d. The two lanes of v0 are only summed
 * together later, by KERNEL_FINALIZE.
 */
#define KERNEL_F1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", #8 \n" \
"fcvtl v16.2d, v16.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n"
/*
 * KERNEL_F: contiguous bulk kernel, 128 bytes (32 single-precision values)
 * per invocation.
 *
 * 1. Load eight 128-bit vectors from X into the even registers
 *    q16, q18, ..., q30 and advance X by 128 bytes.
 * 2. Widen each vector of four floats into two vectors of two doubles:
 *    fcvtl2 puts the upper two floats into the following odd register,
 *    then fcvtl converts the lower two floats in place. fcvtl2 has to come
 *    first because fcvtl overwrites its own source register.
 * 3. Accumulate the squares lane-wise into v0..v7. There are sixteen
 *    widened vectors and eight accumulators, so each accumulator receives
 *    two fmla updates per invocation.
 * 4. Issue two prefetches, 1024 and 1024+64 bytes past the already
 *    advanced X.
 *
 * v0..v7 are combined afterwards by KERNEL_F_FINALIZE.
 */
#define KERNEL_F \
"ldur q16, ["X"] \n" \
"ldur q18, ["X", #16] \n" \
"ldur q20, ["X", #32] \n" \
"ldur q22, ["X", #48] \n" \
"ldur q24, ["X", #64] \n" \
"ldur q26, ["X", #80] \n" \
"ldur q28, ["X", #96] \n" \
"ldur q30, ["X", #112] \n" \
"add "X", "X", #128 \n" \
"fcvtl2 v17.2d, v16.4s \n" \
"fcvtl v16.2d, v16.2s \n" \
"fcvtl2 v19.2d, v18.4s \n" \
"fcvtl v18.2d, v18.2s \n" \
"fcvtl2 v21.2d, v20.4s \n" \
"fcvtl v20.2d, v20.2s \n" \
"fcvtl2 v23.2d, v22.4s \n" \
"fcvtl v22.2d, v22.2s \n" \
"fcvtl2 v25.2d, v24.4s \n" \
"fcvtl v24.2d, v24.2s \n" \
"fcvtl2 v27.2d, v26.4s \n" \
"fcvtl v26.2d, v26.2s \n" \
"fcvtl2 v29.2d, v28.4s \n" \
"fcvtl v28.2d, v28.2s \n" \
"fcvtl2 v31.2d, v30.4s \n" \
"fcvtl v30.2d, v30.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n" \
"fmla v1.2d, v17.2d, v17.2d \n" \
"fmla v2.2d, v18.2d, v18.2d \n" \
"fmla v3.2d, v19.2d, v19.2d \n" \
"fmla v4.2d, v20.2d, v20.2d \n" \
"fmla v5.2d, v21.2d, v21.2d \n" \
"fmla v6.2d, v22.2d, v22.2d \n" \
"fmla v7.2d, v23.2d, v23.2d \n" \
"fmla v0.2d, v24.2d, v24.2d \n" \
"fmla v1.2d, v25.2d, v25.2d \n" \
"fmla v2.2d, v26.2d, v26.2d \n" \
"fmla v3.2d, v27.2d, v27.2d \n" \
"fmla v4.2d, v28.2d, v28.2d \n" \
"fmla v5.2d, v29.2d, v29.2d \n" \
"fmla v6.2d, v30.2d, v30.2d \n" \
"fmla v7.2d, v31.2d, v31.2d \n" \
"prfm PLDL1KEEP, ["X", #1024] \n" \
"prfm PLDL1KEEP, ["X", #1024+64] \n"
/*
 * KERNEL_F_FINALIZE: folds the eight partial accumulators v0..v7 into
 * v0.2d with a pairwise tree of lane-wise adds
 * (v0+v1, v2+v3, v4+v5, v6+v7 -> v0+v2, v4+v6 -> v0+v4).
 * The result is still two separate lanes; KERNEL_FINALIZE adds those.
 */
#define KERNEL_F_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n"
/*
 * KERNEL_FINALIZE: adds the two double-precision lanes of v0 and writes the
 * scalar sum to SSQD (d0). Needed in this branch because every kernel here
 * accumulates into both lanes of v0.2d.
 */
#define KERNEL_FINALIZE \
"faddp "SSQD", v0.2d \n"
/*
 * KERNEL_S1: strided single-step kernel.
 * Same arithmetic as KERNEL_F1 (load 8 bytes into d16, widen the two floats
 * to double, add their squares lane-wise into v0.2d), but X is advanced by
 * the INC_X register instead of a fixed 8 bytes. INC_X is used directly as
 * a byte offset here, so it must already be scaled before this macro runs.
 */
#define KERNEL_S1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"fcvtl v16.2d, v16.2s \n" \
"fmla v0.2d, v16.2d, v16.2d \n"
#endif
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
@ -138,20 +234,19 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" bne .Lnrm2_kernel_S_BEGIN \n" " bne .Lnrm2_kernel_S_BEGIN \n"
".Lnrm2_kernel_F_BEGIN: \n" ".Lnrm2_kernel_F_BEGIN: \n"
" asr "I", "N", #6 \n" " asr "I", "N", #"N_DIV_SHIFT" \n"
" cmp "I", xzr \n" " cmp "I", xzr \n"
" beq .Lnrm2_kernel_S_BEGIN \n" " beq .Lnrm2_kernel_S_BEGIN \n"
" .align 5 \n" " .align 5 \n"
".Lnrm2_kernel_F64: \n" ".Lnrm2_kernel_F: \n"
" "KERNEL_F32" \n" " "KERNEL_F" \n"
" "KERNEL_F32" \n"
" subs "I", "I", #1 \n" " subs "I", "I", #1 \n"
" bne .Lnrm2_kernel_F64 \n" " bne .Lnrm2_kernel_F \n"
" "KERNEL_F32_FINALIZE" \n" " "KERNEL_F_FINALIZE" \n"
".Lnrm2_kernel_F1: \n" ".Lnrm2_kernel_F1: \n"
" ands "I", "N", #63 \n" " ands "I", "N", #"N_REM_MASK" \n"
" ble .Lnrm2_kernel_L999 \n" " ble .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_F10: \n" ".Lnrm2_kernel_F10: \n"
@ -161,7 +256,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" b .Lnrm2_kernel_L999 \n" " b .Lnrm2_kernel_L999 \n"
".Lnrm2_kernel_S_BEGIN: \n" ".Lnrm2_kernel_S_BEGIN: \n"
" lsl "INC_X", "INC_X", #2 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" asr "I", "N", #2 \n" " asr "I", "N", #2 \n"
" cmp "I", xzr \n" " cmp "I", xzr \n"
" ble .Lnrm2_kernel_S1 \n" " ble .Lnrm2_kernel_S1 \n"
@ -184,6 +279,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" bne .Lnrm2_kernel_S10 \n" " bne .Lnrm2_kernel_S10 \n"
".Lnrm2_kernel_L999: \n" ".Lnrm2_kernel_L999: \n"
" "KERNEL_FINALIZE" \n"
" fmov %[RET_], "SSQD" \n" " fmov %[RET_], "SSQD" \n"
: [RET_] "=r" (ret) //%0 : [RET_] "=r" (ret) //%0
@ -214,13 +310,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
#if defined(SMP) #if defined(SMP)
int nthreads; int nthreads;
FLOAT dummy_alpha; FLOAT dummy_alpha[2];
#endif #endif
FLOAT nrm2 = 0.0; FLOAT nrm2 = 0.0;
double nrm2_double = 0.0; double nrm2_double = 0.0;
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x <= 0) return 0.0;
if (n == 1) return fabs(x[0]);
#if defined(SMP) #if defined(SMP)
nthreads = num_cpu_avail(1); nthreads = num_cpu_avail(1);
@ -235,7 +330,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
char result[MAX_CPU_NUMBER * sizeof(double) * 2]; char result[MAX_CPU_NUMBER * sizeof(double) * 2];
double *ptr; double *ptr;
#if !defined(COMPLEX)
mode = BLAS_SINGLE | BLAS_REAL; mode = BLAS_SINGLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, NULL, 0, result, 0, x, inc_x, NULL, 0, result, 0,