diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index fb19c0918..1a179f727 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -19,8 +19,8 @@ ISAMAXKERNEL = iamax_thunderx2t99.c IDAMAXKERNEL = iamax_thunderx2t99.c -SNRM2KERNEL = snrm2_thunderx2t99.c -CNRM2KERNEL = cnrm2_thunderx2t99.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c DAXPYKERNEL = daxpy_thunderx2t99.S diff --git a/kernel/arm64/snrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c similarity index 68% rename from kernel/arm64/snrm2_thunderx2t99.c rename to kernel/arm64/scnrm2_thunderx2t99.c index 913909a44..bf883e4d1 100644 --- a/kernel/arm64/snrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -36,20 +36,29 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif -#define N "x0" /* vector length */ -#define X "x1" /* X vector address */ -#define INC_X "x2" /* X stride */ -#define I "x5" /* loop variable */ +#if !defined(COMPLEX) +#define N "x0" /* vector length */ +#define X "x1" /* X vector address */ +#define INC_X "x2" /* X stride */ +#define I "x5" /* loop variable */ -#define TMPF "s16" -#define TMPFD "d17" -#define SSQD "d0" +#define TMPF "s16" +#define TMPFD "d17" +#define SSQD "d0" + +#define N_DIV_SHIFT "6" +#define N_REM_MASK "63" +#define INC_SHIFT "2" #define KERNEL_F1 \ "ldr "TMPF", ["X"], #4 \n" \ "fcvt "TMPFD", "TMPF" \n" \ "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" +#define KERNEL_F \ + KERNEL_F32 \ + KERNEL_F32 + #define KERNEL_F32 \ "ldur q16, ["X"] \n" \ "ldur q18, ["X", #16] \n" \ @@ -95,7 +104,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n "prfm PLDL1KEEP, ["X", #1024] \n" \ "prfm PLDL1KEEP, ["X", #1024+64] \n" -#define KERNEL_F32_FINALIZE \ +#define KERNEL_F_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ @@ -111,6 +120,93 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n "fcvt "TMPFD", "TMPF" \n" \ "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" +#define KERNEL_FINALIZE \ + "" + +#else + +#define N "x0" /* vector length */ +#define X "x1" /* X vector address */ +#define INC_X "x2" /* X stride */ +#define I "x5" /* loop variable */ + +#define TMPF "d16" +#define SSQD "d0" + +#define N_DIV_SHIFT "4" +#define N_REM_MASK "15" +#define INC_SHIFT "3" + +#define KERNEL_F1 \ + "ldr "TMPF", ["X"] \n" \ + "add "X", "X", #8 \n" \ + "fcvtl v16.2d, v16.2s \n" \ + "fmla v0.2d, v16.2d, v16.2d \n" + +#define KERNEL_F \ + "ldur q16, ["X"] \n" \ + "ldur q18, ["X", #16] \n" \ + "ldur q20, ["X", #32] \n" \ + "ldur q22, ["X", #48] \n" \ + "ldur q24, ["X", #64] \n" \ + "ldur q26, ["X", #80] \n" \ + "ldur q28, ["X", #96] \n" \ + "ldur q30, ["X", #112] \n" \ + "add "X", "X", #128 \n" \ + "fcvtl2 v17.2d, v16.4s \n" \ + "fcvtl v16.2d, v16.2s \n" \ + "fcvtl2 v19.2d, v18.4s \n" \ + "fcvtl v18.2d, v18.2s \n" \ + "fcvtl2 v21.2d, v20.4s \n" \ + "fcvtl v20.2d, v20.2s \n" \ + "fcvtl2 v23.2d, v22.4s \n" \ + "fcvtl v22.2d, v22.2s \n" \ + "fcvtl2 v25.2d, v24.4s \n" \ + "fcvtl v24.2d, v24.2s \n" \ + "fcvtl2 v27.2d, v26.4s \n" \ + "fcvtl v26.2d, v26.2s \n" \ + "fcvtl2 v29.2d, v28.4s \n" \ + "fcvtl v28.2d, v28.2s \n" \ + "fcvtl2 v31.2d, v30.4s \n" \ + "fcvtl v30.2d, v30.2s \n" \ + "fmla v0.2d, v16.2d, v16.2d \n" \ + "fmla v1.2d, v17.2d, v17.2d \n" \ + "fmla v2.2d, v18.2d, v18.2d \n" \ + "fmla v3.2d, v19.2d, v19.2d \n" \ + "fmla v4.2d, v20.2d, v20.2d \n" \ + "fmla v5.2d, v21.2d, v21.2d \n" \ + "fmla v6.2d, v22.2d, v22.2d \n" \ + "fmla v7.2d, v23.2d, v23.2d \n" \ + "fmla v0.2d, v24.2d, v24.2d \n" \ + "fmla v1.2d, v25.2d, v25.2d \n" \ + "fmla v2.2d, v26.2d, v26.2d \n" \ + "fmla v3.2d, v27.2d, v27.2d \n" \ + "fmla v4.2d, v28.2d, v28.2d \n" \ + "fmla v5.2d, v29.2d, v29.2d \n" \ + "fmla v6.2d, v30.2d, v30.2d \n" \ + "fmla v7.2d, v31.2d, v31.2d \n" \ + "prfm PLDL1KEEP, ["X", #1024] \n" \ + "prfm PLDL1KEEP, ["X", #1024+64] \n" + +#define KERNEL_F_FINALIZE \ + "fadd v0.2d, v0.2d, v1.2d \n" \ + "fadd v2.2d, v2.2d, v3.2d \n" \ + "fadd v4.2d, v4.2d, v5.2d \n" \ + "fadd v6.2d, v6.2d, v7.2d \n" \ + "fadd v0.2d, v0.2d, v2.2d \n" \ + "fadd v4.2d, v4.2d, v6.2d \n" \ + "fadd v0.2d, v0.2d, v4.2d \n" + +#define KERNEL_FINALIZE \ + "faddp "SSQD", v0.2d \n" + +#define KERNEL_S1 \ + "ldr "TMPF", ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "fcvtl v16.2d, v16.2s \n" \ + "fmla v0.2d, v16.2d, v16.2d \n" +#endif + static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -138,20 +234,19 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " bne .Lnrm2_kernel_S_BEGIN \n" ".Lnrm2_kernel_F_BEGIN: \n" - " asr "I", "N", #6 \n" + " asr "I", "N", #"N_DIV_SHIFT" \n" " cmp "I", xzr \n" " beq .Lnrm2_kernel_S_BEGIN \n" " .align 5 \n" - ".Lnrm2_kernel_F64: \n" - " "KERNEL_F32" \n" - " "KERNEL_F32" \n" + ".Lnrm2_kernel_F: \n" + " "KERNEL_F" \n" " subs "I", "I", #1 \n" - " bne .Lnrm2_kernel_F64 \n" - " "KERNEL_F32_FINALIZE" \n" + " bne .Lnrm2_kernel_F \n" + " "KERNEL_F_FINALIZE" \n" ".Lnrm2_kernel_F1: \n" - " ands "I", "N", #63 \n" + " ands "I", "N", #"N_REM_MASK" \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F10: \n" @@ -161,7 +256,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " b .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_S_BEGIN: \n" - " lsl "INC_X", "INC_X", #2 \n" + " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "I", "N", #2 \n" " cmp "I", xzr \n" " ble .Lnrm2_kernel_S1 \n" @@ -184,6 +279,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " bne .Lnrm2_kernel_S10 \n" ".Lnrm2_kernel_L999: \n" + " "KERNEL_FINALIZE" \n" " fmov %[RET_], "SSQD" \n" : [RET_] "=r" (ret) //%0 @@ -214,13 +310,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; - FLOAT dummy_alpha; + FLOAT dummy_alpha[2]; #endif FLOAT nrm2 = 0.0; double nrm2_double = 0.0; if (n <= 0 || inc_x <= 0) return 0.0; - if (n == 1) return fabs(x[0]); #if defined(SMP) nthreads = num_cpu_avail(1); @@ -235,7 +330,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) char result[MAX_CPU_NUMBER * sizeof(double) * 2]; double *ptr; +#if !defined(COMPLEX) mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0,