Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop
This commit is contained in:
@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
ifeq ($(CORE), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
@@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
||||
175
kernel/arm64/KERNEL.TSV110
Normal file
175
kernel/arm64/KERNEL.TSV110
Normal file
@@ -0,0 +1,175 @@
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
@@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( x[ix] > minf )
|
||||
if( x[ix] < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = x[ix];
|
||||
|
||||
@@ -129,7 +129,7 @@ LL(12):
|
||||
STFD f0, 14 * SIZE(CO1)
|
||||
STFD f0, 15 * SIZE(CO1)
|
||||
|
||||
dcbst PRE, CO1
|
||||
dcbtst PRE, CO1
|
||||
addi CO1, CO1, 16 * SIZE
|
||||
bdnz LL(12)
|
||||
.align 4
|
||||
|
||||
@@ -134,7 +134,7 @@ LL(12):
|
||||
STFD f0, 14 * SIZE(CO1)
|
||||
STFD f0, 15 * SIZE(CO1)
|
||||
|
||||
dcbst PRE, CO1
|
||||
dcbtst PRE, CO1
|
||||
addi CO1, CO1, 16 * SIZE
|
||||
bdnz LL(12)
|
||||
.align 4
|
||||
|
||||
@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
||||
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", // "0", "1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
||||
@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"0", "1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
||||
@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"movsd %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovsd %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
|
||||
@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
|
||||
@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
||||
"subq $8 , %1 \n\t"
|
||||
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"movsd %%xmm11,8(%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
|
||||
@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (n) // 0
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (alpha), // 3
|
||||
|
||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha), // 2
|
||||
"r" (n2) // 3
|
||||
: "cc",
|
||||
|
||||
@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" cmpq $0, %0 \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
|
||||
|
||||
" addq $8, %1 \n\t"
|
||||
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
|
||||
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
|
||||
|
||||
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
|
||||
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
|
||||
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
|
||||
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
|
||||
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
" jz 22f \n\t"
|
||||
|
||||
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
|
||||
|
||||
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
|
||||
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
|
||||
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
|
||||
|
||||
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
|
||||
|
||||
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
|
||||
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
|
||||
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
|
||||
|
||||
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
|
||||
" vmovups 32(%9), %%ymm4 \n\t"
|
||||
" vmovups 32(%3), %%ymm4 \n\t"
|
||||
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
|
||||
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
|
||||
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
|
||||
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"5: \n\t" // i = 0
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
|
||||
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups %%ymm8 , (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups %%ymm8 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm8 , (%4) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
|
||||
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
|
||||
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm9 , (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm9 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
|
||||
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
|
||||
" vmovups (%9), %%ymm0 \n\t"
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm10, (%8) \n\t" // write a
|
||||
" vmovups (%3), %%ymm0 \n\t"
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm10, (%2) \n\t" // write a
|
||||
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
|
||||
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm11, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm11, (%2) \n\t" // write a
|
||||
" vmovups %%ymm11, (%5) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
|
||||
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm12, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm12, (%2) \n\t" // write a
|
||||
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
|
||||
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm13, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm13, (%2) \n\t" // write a
|
||||
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
|
||||
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
" addq $64, %9 \n\t" // b=b+8
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $64, %3 \n\t" // b=b+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
|
||||
" vmovups 32(%9), %%ymm1 \n\t"
|
||||
" vmovups %%ymm14, (%8) \n\t" // write a
|
||||
" vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vmovups %%ymm14, (%2) \n\t" // write a
|
||||
" vmovups %%ymm14, (%6) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
|
||||
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
" addq $32, %8 \n\t" // a=a+8
|
||||
" addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
|
||||
" vmovups %%ymm15, (%8) \n\t" // write a
|
||||
" vmovups %%ymm15, (%2) \n\t" // write a
|
||||
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
|
||||
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c3), // 5
|
||||
"r" (c6), // 6
|
||||
"r" (ldc), // 7
|
||||
"r" (as), // 8
|
||||
"r" (bs) // 9
|
||||
"r" (a), // 8
|
||||
"r" (b) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" prefetcht0 384(%3,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" prefetcht0 384(%7,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
" jz 2f \n\t"
|
||||
|
||||
" prefetcht0 384(%2,%1,8) \n\t"
|
||||
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" prefetcht0 384(%6,%1,8) \n\t"
|
||||
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
|
||||
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"3: \n\t" // i = 1
|
||||
|
||||
" vmovddup (%7), %%xmm1 \n\t" // read b
|
||||
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
|
||||
" vmovddup (%3), %%xmm1 \n\t" // read b
|
||||
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
|
||||
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
|
||||
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
|
||||
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
|
||||
|
||||
" vmovups %%xmm12 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%6) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%6) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%6) \n\t" // write a
|
||||
" vmovups %%xmm12 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%2) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%2) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%2) \n\t" // write a
|
||||
|
||||
" vmovups %%xmm12 , (%5) \n\t" // write c1
|
||||
" vmovups %%xmm13 , 16(%5) \n\t"
|
||||
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
|
||||
|
||||
" \n\t" // i = 0
|
||||
" subq $16 , %7 \n\t" // b = b - 2
|
||||
" subq $64 , %6 \n\t" // a = a - 8
|
||||
" subq $16 , %3 \n\t" // b = b - 2
|
||||
" subq $64 , %2 \n\t" // a = a - 8
|
||||
|
||||
" vmovddup (%7), %%xmm0 \n\t" // read bb
|
||||
" vmovddup (%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
|
||||
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
|
||||
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
|
||||
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%6) \n\t"
|
||||
" vmovups %%xmm10 , 32(%6) \n\t"
|
||||
" vmovups %%xmm11 , 48(%6) \n\t"
|
||||
" vmovups %%xmm8 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%2) \n\t"
|
||||
" vmovups %%xmm10 , 32(%2) \n\t"
|
||||
" vmovups %%xmm11 , 48(%2) \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%4) \n\t" // write c0
|
||||
" vmovups %%xmm9 , 16(%4) \n\t"
|
||||
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
#endif
|
||||
|
||||
movq %rsp, %rbx # save old stack
|
||||
subq $128 + LOCAL_BUFFER_SIZE, %rsp
|
||||
subq $256 + LOCAL_BUFFER_SIZE, %rsp
|
||||
andq $-4096, %rsp # align stack
|
||||
|
||||
STACK_TOUCHING
|
||||
|
||||
@@ -383,7 +383,7 @@
|
||||
EMMS
|
||||
|
||||
movq %rsp, %rbx # save old stack
|
||||
subq $128 + LOCAL_BUFFER_SIZE, %rsp
|
||||
subq $256 + LOCAL_BUFFER_SIZE, %rsp
|
||||
andq $-4096, %rsp # align stack
|
||||
|
||||
STACK_TOUCHING
|
||||
|
||||
@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"movss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
|
||||
"3: \n\t"
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n1) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (src), // 2
|
||||
"r" (dest) // 3
|
||||
: "cc",
|
||||
|
||||
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vbroadcastss (%2), %%xmm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%xmm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%xmm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%xmm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%xmm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%xmm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%xmm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%xmm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
||||
|
||||
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"addq $4 , %8 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
|
||||
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
|
||||
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
|
||||
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
|
||||
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
|
||||
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%8,%0,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
".align 2 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%8,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
"prefetcht0 192(%8,%2,4) \n\t"
|
||||
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
|
||||
"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
|
||||
"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
|
||||
|
||||
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
|
||||
"addq $16, %8 \n\t"
|
||||
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
|
||||
"addq $16, %2 \n\t"
|
||||
"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x8 1
|
||||
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
@@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
@@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"testq $0x08, %1 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $8 , %8 \n\t"
|
||||
"addq $8 , %2 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
@@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
|
||||
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"addq $16, %8 \n\t"
|
||||
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
|
||||
"addq $16, %2 \n\t"
|
||||
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
|
||||
"subq $16, %1 \n\t"
|
||||
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
@@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
@@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
@@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
|
||||
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
|
||||
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
|
||||
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movss (%2), %%xmm12 \n\t" // x0
|
||||
"movss 4(%2), %%xmm13 \n\t" // x1
|
||||
"movss 8(%2), %%xmm14 \n\t" // x2
|
||||
"movss 12(%2), %%xmm15 \n\t" // x3
|
||||
"movss (%3), %%xmm12 \n\t" // x0
|
||||
"movss 4(%3), %%xmm13 \n\t" // x1
|
||||
"movss 8(%3), %%xmm14 \n\t" // x2
|
||||
"movss 12(%3), %%xmm15 \n\t" // x3
|
||||
"shufps $0, %%xmm12, %%xmm12\n\t"
|
||||
"shufps $0, %%xmm13, %%xmm13\n\t"
|
||||
"shufps $0, %%xmm14, %%xmm14\n\t"
|
||||
"shufps $0, %%xmm15, %%xmm15\n\t"
|
||||
|
||||
"movss 16(%2), %%xmm0 \n\t" // x4
|
||||
"movss 20(%2), %%xmm1 \n\t" // x5
|
||||
"movss 24(%2), %%xmm2 \n\t" // x6
|
||||
"movss 28(%2), %%xmm3 \n\t" // x7
|
||||
"movss 16(%3), %%xmm0 \n\t" // x4
|
||||
"movss 20(%3), %%xmm1 \n\t" // x5
|
||||
"movss 24(%3), %%xmm2 \n\t" // x6
|
||||
"movss 28(%3), %%xmm3 \n\t" // x7
|
||||
"shufps $0, %%xmm0 , %%xmm0 \n\t"
|
||||
"shufps $0, %%xmm1 , %%xmm1 \n\t"
|
||||
"shufps $0, %%xmm2 , %%xmm2 \n\t"
|
||||
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"1: \n\t"
|
||||
"xorps %%xmm4 , %%xmm4 \n\t"
|
||||
"xorps %%xmm5 , %%xmm5 \n\t"
|
||||
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
".p2align 1 \n\t"
|
||||
"movups (%4,%0,4), %%xmm8 \n\t"
|
||||
"movups (%5,%0,4), %%xmm9 \n\t"
|
||||
"movups (%6,%0,4), %%xmm10 \n\t"
|
||||
"movups (%7,%0,4), %%xmm11 \n\t"
|
||||
"movups (%5,%0,4), %%xmm8 \n\t"
|
||||
"movups (%6,%0,4), %%xmm9 \n\t"
|
||||
"movups (%7,%0,4), %%xmm10 \n\t"
|
||||
"movups (%8,%0,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm12, %%xmm8 \n\t"
|
||||
"mulps %%xmm13, %%xmm9 \n\t"
|
||||
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"movups (%4,%8,4), %%xmm8 \n\t"
|
||||
"movups (%5,%8,4), %%xmm9 \n\t"
|
||||
"movups (%6,%8,4), %%xmm10 \n\t"
|
||||
"movups (%7,%8,4), %%xmm11 \n\t"
|
||||
"movups (%5,%2,4), %%xmm8 \n\t"
|
||||
"movups (%6,%2,4), %%xmm9 \n\t"
|
||||
"movups (%7,%2,4), %%xmm10 \n\t"
|
||||
"movups (%8,%2,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm0 , %%xmm8 \n\t"
|
||||
"mulps %%xmm1 , %%xmm9 \n\t"
|
||||
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %2 \n\t"
|
||||
"addps %%xmm5 , %%xmm4 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"mulps %%xmm6 , %%xmm4 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"addps %%xmm4 , %%xmm7 \n\t"
|
||||
|
||||
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
|
||||
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
"vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
"vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
"vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
"vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
|
||||
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
|
||||
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
|
||||
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
|
||||
"vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
|
||||
"vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
|
||||
"vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
|
||||
"vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
|
||||
|
||||
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $4, %8 \n\t"
|
||||
"addq $4, %2 \n\t"
|
||||
"addq $4, %0 \n\t"
|
||||
"subq $4, %1 \n\t"
|
||||
|
||||
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
|
||||
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
|
||||
|
||||
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $8, %8 \n\t"
|
||||
"addq $8, %2 \n\t"
|
||||
"addq $8, %0 \n\t"
|
||||
"subq $8, %1 \n\t"
|
||||
|
||||
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
"vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%6,%0,4) \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
"vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
"vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%8,%0,4) \n\t"
|
||||
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%4,%8,4) \n\t"
|
||||
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%5,%8,4) \n\t"
|
||||
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
|
||||
"prefetcht0 192(%5,%2,4) \n\t"
|
||||
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
"vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%6,%2,4) \n\t"
|
||||
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
"vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
"prefetcht0 192(%6,%8,4) \n\t"
|
||||
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
|
||||
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%7,%8,4) \n\t"
|
||||
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
|
||||
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"prefetcht0 192(%7,%2,4) \n\t"
|
||||
"vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
|
||||
"vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
"prefetcht0 192(%8,%2,4) \n\t"
|
||||
"vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
|
||||
"vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
|
||||
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
"vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
"vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
|
||||
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
|
||||
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"addq $16, %8 \n\t"
|
||||
"addq $16, %2 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
"+r" (n), // 1
|
||||
"+r" (lda4) // 2
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (x), // 3
|
||||
"r" (y), // 4
|
||||
"r" (ap[0]), // 5
|
||||
"r" (ap[1]), // 6
|
||||
"r" (ap[2]), // 7
|
||||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"movss %%xmm11,4(%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"movss %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
|
||||
@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
|
||||
@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
|
||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"+r" (from) // 0
|
||||
:
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
|
||||
@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"3: \n\t"
|
||||
|
||||
" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
|
||||
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
|
||||
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
|
||||
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
|
||||
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
|
||||
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
|
||||
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
|
||||
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
|
||||
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
|
||||
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
|
||||
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
|
||||
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
|
||||
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i]
|
||||
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i]
|
||||
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i]
|
||||
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i]
|
||||
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i]
|
||||
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i]
|
||||
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i]
|
||||
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i]
|
||||
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i]
|
||||
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i]
|
||||
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i]
|
||||
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i]
|
||||
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i]
|
||||
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i]
|
||||
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i]
|
||||
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i]
|
||||
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i]
|
||||
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i]
|
||||
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
|
||||
" subq $64 , %6 \n\t" // a -= m
|
||||
" subq $8 , %7 \n\t" // b -= n
|
||||
" subq $64 , %2 \n\t" // a -= m
|
||||
" subq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i]
|
||||
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i]
|
||||
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"3: \n\t"
|
||||
|
||||
" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i]
|
||||
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i]
|
||||
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i]
|
||||
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i]
|
||||
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i]
|
||||
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i]
|
||||
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i]
|
||||
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i]
|
||||
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i]
|
||||
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i]
|
||||
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i]
|
||||
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i]
|
||||
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i]
|
||||
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i]
|
||||
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
|
||||
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i]
|
||||
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i]
|
||||
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i]
|
||||
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i]
|
||||
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i]
|
||||
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i]
|
||||
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
|
||||
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
|
||||
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
|
||||
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
|
||||
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
|
||||
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
|
||||
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
|
||||
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
|
||||
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
|
||||
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
|
||||
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
|
||||
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
|
||||
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
|
||||
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
|
||||
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
|
||||
|
||||
" addq $64 , %6 \n\t" // a -= m
|
||||
" addq $8 , %7 \n\t" // b -= n
|
||||
" addq $64 , %2 \n\t" // a -= m
|
||||
" addq $8 , %3 \n\t" // b -= n
|
||||
|
||||
" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
|
||||
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
|
||||
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
|
||||
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
|
||||
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
|
||||
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
|
||||
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
|
||||
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
|
||||
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
|
||||
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
|
||||
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"3: \n\t" // i = 0
|
||||
|
||||
" vbroadcastss (%7), %%xmm0 \n\t" // read bb
|
||||
" vbroadcastss 4(%7), %%xmm1 \n\t" // read b
|
||||
" vbroadcastss (%3), %%xmm0 \n\t" // read bb
|
||||
" vbroadcastss 4(%3), %%xmm1 \n\t" // read b
|
||||
|
||||
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
|
||||
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
|
||||
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
|
||||
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%6) \n\t"
|
||||
" vmovups %%xmm10 , 32(%6) \n\t"
|
||||
" vmovups %%xmm11 , 48(%6) \n\t"
|
||||
" vmovups %%xmm8 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%2) \n\t"
|
||||
" vmovups %%xmm10 , 32(%2) \n\t"
|
||||
" vmovups %%xmm11 , 48(%2) \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%4) \n\t" // write c0
|
||||
" vmovups %%xmm9 , 16(%4) \n\t"
|
||||
@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t"
|
||||
|
||||
" \n\t" // i = 1
|
||||
" addq $8 , %7 \n\t" // b = b + 2
|
||||
" addq $64 , %6 \n\t" // a = a + 16
|
||||
" addq $8 , %3 \n\t" // b = b + 2
|
||||
" addq $64 , %2 \n\t" // a = a + 16
|
||||
|
||||
" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
|
||||
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
|
||||
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
|
||||
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
|
||||
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
|
||||
|
||||
" vmovups %%xmm12 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%6) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%6) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%6) \n\t" // write a
|
||||
" vmovups %%xmm12 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%2) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%2) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%2) \n\t" // write a
|
||||
|
||||
" vmovups %%xmm12 , (%5) \n\t" // write c1
|
||||
" vmovups %%xmm13 , 16(%5) \n\t"
|
||||
@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%2,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
|
||||
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
|
||||
" vmovups (%6,%1,8), %%xmm4 \n\t"
|
||||
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
|
||||
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
|
||||
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
|
||||
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
|
||||
|
||||
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
|
||||
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
|
||||
@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"3: \n\t" // i = 1
|
||||
|
||||
" vbroadcastss (%7), %%xmm1 \n\t" // read b
|
||||
" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
|
||||
" vbroadcastss (%3), %%xmm1 \n\t" // read b
|
||||
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
|
||||
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
|
||||
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
|
||||
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
|
||||
|
||||
" vmovups %%xmm12 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%6) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%6) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%6) \n\t" // write a
|
||||
" vmovups %%xmm12 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm13 , 16(%2) \n\t" // write a
|
||||
" vmovups %%xmm14 , 32(%2) \n\t" // write a
|
||||
" vmovups %%xmm15 , 48(%2) \n\t" // write a
|
||||
|
||||
" vmovups %%xmm12 , (%5) \n\t" // write c1
|
||||
" vmovups %%xmm13 , 16(%5) \n\t"
|
||||
@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
|
||||
|
||||
" \n\t" // i = 0
|
||||
" subq $8 , %7 \n\t" // b = b - 2
|
||||
" subq $64 , %6 \n\t" // a = a - 16
|
||||
" subq $8 , %3 \n\t" // b = b - 2
|
||||
" subq $64 , %2 \n\t" // a = a - 16
|
||||
|
||||
" vbroadcastss (%7), %%xmm0 \n\t" // read bb
|
||||
" vbroadcastss (%3), %%xmm0 \n\t" // read bb
|
||||
|
||||
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
|
||||
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
|
||||
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
|
||||
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%6) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%6) \n\t"
|
||||
" vmovups %%xmm10 , 32(%6) \n\t"
|
||||
" vmovups %%xmm11 , 48(%6) \n\t"
|
||||
" vmovups %%xmm8 , (%2) \n\t" // write a
|
||||
" vmovups %%xmm9 , 16(%2) \n\t"
|
||||
" vmovups %%xmm10 , 32(%2) \n\t"
|
||||
" vmovups %%xmm11 , 48(%2) \n\t"
|
||||
|
||||
" vmovups %%xmm8 , (%4) \n\t" // write c0
|
||||
" vmovups %%xmm9 , 16(%4) \n\t"
|
||||
@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n1), // 0
|
||||
"+a" (i), // 1
|
||||
"+r" (as), // 2
|
||||
"+r" (bs) // 3
|
||||
:
|
||||
"r" (n1), // 0
|
||||
"a" (i), // 1
|
||||
"r" (a), // 2
|
||||
"r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (as), // 6
|
||||
"r" (bs) // 7
|
||||
"r" (c), // 4
|
||||
"r" (c1), // 5
|
||||
"r" (a), // 6
|
||||
"r" (b) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
|
||||
@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha), // 4
|
||||
|
||||
@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -107,10 +107,10 @@ if ( n < 1280 )
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -199,10 +199,10 @@ if ( n < 1280 )
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
"vmovups %%xmm4, 16(%4) \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
:
|
||||
"+r" (i), // 0
|
||||
"+r" (n) // 1
|
||||
:
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
|
||||
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
|
||||
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
||||
@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"+r" (n), // 0
|
||||
"+r" (x) // 1
|
||||
:
|
||||
"r" (alpha) // 2
|
||||
: "cc", //"%0", "%1",
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = damax_z13.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = zamax_z13.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = damin_z13.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = zamin_z13.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = dmax_z13.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = dmin_z13.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
@@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
|
||||
146
kernel/zarch/KERNEL.Z14
Normal file
146
kernel/zarch/KERNEL.Z14
Normal file
@@ -0,0 +1,146 @@
|
||||
SAMAXKERNEL = samax.c
|
||||
DAMAXKERNEL = damax.c
|
||||
CAMAXKERNEL = camax.c
|
||||
ZAMAXKERNEL = zamax.c
|
||||
|
||||
SAMINKERNEL = samin.c
|
||||
DAMINKERNEL = damin.c
|
||||
CAMINKERNEL = camin.c
|
||||
ZAMINKERNEL = zamin.c
|
||||
|
||||
SMAXKERNEL = smax.c
|
||||
DMAXKERNEL = dmax.c
|
||||
|
||||
SMINKERNEL = smin.c
|
||||
DMINKERNEL = dmin.c
|
||||
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ismax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ismin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
DSDOTKERNEL = dsdot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = cscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
CGEMVNKERNEL = cgemv_n_4.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
CGEMVTKERNEL = cgemv_t_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
STRMMKERNEL = strmm8x4V.S
|
||||
DTRMMKERNEL = trmm8x4V.S
|
||||
CTRMMKERNEL = ctrmm4x4V.S
|
||||
ZTRMMKERNEL = ztrmm4x4V.S
|
||||
|
||||
SGEMMKERNEL = strmm8x4V.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
|
||||
|
||||
DGEMMKERNEL = gemm8x4V.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ctrmm4x4V.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ztrmm4x4V.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
215
kernel/zarch/camax.c
Normal file
215
kernel/zarch/camax.c
Normal file
@@ -0,0 +1,215 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||
|
||||
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT amax;
|
||||
|
||||
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||
"vlef %%v16,4(%[x]),0\n\t"
|
||||
"vlef %%v0,8(%[x]),1\n\t"
|
||||
"vlef %%v16,12(%[x]),1\n\t"
|
||||
"vlef %%v0,16(%[x]),2\n\t"
|
||||
"vlef %%v16,20(%[x]),2\n\t"
|
||||
"vlef %%v0,24(%[x]),3\n\t"
|
||||
"vlef %%v16,28(%[x]),3\n\t"
|
||||
"vflpsb %%v0,%%v0\n\t"
|
||||
"vflpsb %%v16,%%v16\n\t"
|
||||
"vfasb %%v0,%%v0,%%v16\n\t"
|
||||
"vleib %%v1,0,0\n\t"
|
||||
"vleib %%v1,1,1\n\t"
|
||||
"vleib %%v1,2,2\n\t"
|
||||
"vleib %%v1,3,3\n\t"
|
||||
"vleib %%v1,8,4\n\t"
|
||||
"vleib %%v1,9,5\n\t"
|
||||
"vleib %%v1,10,6\n\t"
|
||||
"vleib %%v1,11,7\n\t"
|
||||
"vleib %%v1,16,8\n\t"
|
||||
"vleib %%v1,17,9\n\t"
|
||||
"vleib %%v1,18,10\n\t"
|
||||
"vleib %%v1,19,11\n\t"
|
||||
"vleib %%v1,24,12\n\t"
|
||||
"vleib %%v1,25,13\n\t"
|
||||
"vleib %%v1,26,14\n\t"
|
||||
"vleib %%v1,27,15\n\t"
|
||||
"srlg %[n],%[n],5\n\t"
|
||||
"xgr %%r1,%%r1\n\t"
|
||||
"0:\n\t"
|
||||
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||
"vl %%v2,16(%%r1,%[x])\n\t"
|
||||
"vpkg %%v17,%%v16,%%v2\n\t"
|
||||
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
||||
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||
"vl %%v2,48(%%r1,%[x])\n\t"
|
||||
"vpkg %%v19,%%v18,%%v2\n\t"
|
||||
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
||||
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||
"vl %%v2,80(%%r1,%[x])\n\t"
|
||||
"vpkg %%v21,%%v20,%%v2\n\t"
|
||||
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
||||
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||
"vl %%v2,112(%%r1,%[x])\n\t"
|
||||
"vpkg %%v23,%%v22,%%v2\n\t"
|
||||
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
||||
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||
"vl %%v2,144(%%r1,%[x])\n\t"
|
||||
"vpkg %%v25,%%v24,%%v2\n\t"
|
||||
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
||||
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||
"vl %%v2,176(%%r1,%[x])\n\t"
|
||||
"vpkg %%v27,%%v26,%%v2\n\t"
|
||||
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
||||
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||
"vl %%v2,208(%%r1,%[x])\n\t"
|
||||
"vpkg %%v29,%%v28,%%v2\n\t"
|
||||
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
||||
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||
"vl %%v2,240(%%r1,%[x])\n\t"
|
||||
"vpkg %%v31,%%v30,%%v2\n\t"
|
||||
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
||||
"vflpsb %%v16,%%v16\n\t"
|
||||
"vflpsb %%v17,%%v17\n\t"
|
||||
"vflpsb %%v18,%%v18\n\t"
|
||||
"vflpsb %%v19,%%v19\n\t"
|
||||
"vflpsb %%v20,%%v20\n\t"
|
||||
"vflpsb %%v21,%%v21\n\t"
|
||||
"vflpsb %%v22,%%v22\n\t"
|
||||
"vflpsb %%v23,%%v23\n\t"
|
||||
"vflpsb %%v24,%%v24\n\t"
|
||||
"vflpsb %%v25,%%v25\n\t"
|
||||
"vflpsb %%v26,%%v26\n\t"
|
||||
"vflpsb %%v27,%%v27\n\t"
|
||||
"vflpsb %%v28,%%v28\n\t"
|
||||
"vflpsb %%v29,%%v29\n\t"
|
||||
"vflpsb %%v30,%%v30\n\t"
|
||||
"vflpsb %%v31,%%v31\n\t"
|
||||
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||
"vfasb %%v18,%%v18,%%v19\n\t"
|
||||
"vfasb %%v20,%%v20,%%v21\n\t"
|
||||
"vfasb %%v22,%%v22,%%v23\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vfasb %%v26,%%v26,%%v27\n\t"
|
||||
"vfasb %%v28,%%v28,%%v29\n\t"
|
||||
"vfasb %%v30,%%v30,%%v31\n\t"
|
||||
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
|
||||
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
|
||||
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
|
||||
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
|
||||
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
|
||||
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
|
||||
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
|
||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"agfi %%r1, 256\n\t"
|
||||
"brctg %[n], 0b\n\t"
|
||||
"veslg %%v16,%%v0,32\n\t"
|
||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"vrepf %%v16,%%v0,2\n\t"
|
||||
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"ler %[amax],%%f0"
|
||||
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31");
|
||||
|
||||
return amax;
|
||||
}
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT maxf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0)
|
||||
return (maxf);
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
maxf = camax_kernel_32(n1, x);
|
||||
ix = n1 * 2;
|
||||
i = n1;
|
||||
} else {
|
||||
maxf = CABS1(x, 0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (CABS1(x, ix) > maxf) {
|
||||
maxf = CABS1(x, ix);
|
||||
}
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
return (maxf);
|
||||
|
||||
} else {
|
||||
|
||||
maxf = CABS1(x, 0);
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
while (i < n1) {
|
||||
|
||||
if (CABS1(x, ix) > maxf) {
|
||||
maxf = CABS1(x, ix);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2) > maxf) {
|
||||
maxf = CABS1(x, ix + inc_x2);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
|
||||
maxf = CABS1(x, ix + inc_x2 * 2);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
|
||||
maxf = CABS1(x, ix + inc_x2 * 3);
|
||||
}
|
||||
|
||||
ix += inc_x2 * 4;
|
||||
|
||||
i += 4;
|
||||
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (CABS1(x, ix) > maxf) {
|
||||
maxf = CABS1(x, ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
return (maxf);
|
||||
}
|
||||
}
|
||||
215
kernel/zarch/camin.c
Normal file
215
kernel/zarch/camin.c
Normal file
@@ -0,0 +1,215 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||
|
||||
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT amin;
|
||||
|
||||
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||
"vlef %%v16,4(%[x]),0\n\t"
|
||||
"vlef %%v0,8(%[x]),1\n\t"
|
||||
"vlef %%v16,12(%[x]),1\n\t"
|
||||
"vlef %%v0,16(%[x]),2\n\t"
|
||||
"vlef %%v16,20(%[x]),2\n\t"
|
||||
"vlef %%v0,24(%[x]),3\n\t"
|
||||
"vlef %%v16,28(%[x]),3\n\t"
|
||||
"vflpsb %%v0,%%v0\n\t"
|
||||
"vflpsb %%v16,%%v16\n\t"
|
||||
"vfasb %%v0,%%v0,%%v16\n\t"
|
||||
"vleib %%v1,0,0\n\t"
|
||||
"vleib %%v1,1,1\n\t"
|
||||
"vleib %%v1,2,2\n\t"
|
||||
"vleib %%v1,3,3\n\t"
|
||||
"vleib %%v1,8,4\n\t"
|
||||
"vleib %%v1,9,5\n\t"
|
||||
"vleib %%v1,10,6\n\t"
|
||||
"vleib %%v1,11,7\n\t"
|
||||
"vleib %%v1,16,8\n\t"
|
||||
"vleib %%v1,17,9\n\t"
|
||||
"vleib %%v1,18,10\n\t"
|
||||
"vleib %%v1,19,11\n\t"
|
||||
"vleib %%v1,24,12\n\t"
|
||||
"vleib %%v1,25,13\n\t"
|
||||
"vleib %%v1,26,14\n\t"
|
||||
"vleib %%v1,27,15\n\t"
|
||||
"srlg %[n],%[n],5\n\t"
|
||||
"xgr %%r1,%%r1\n\t"
|
||||
"0:\n\t"
|
||||
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||
"vl %%v2,16(%%r1,%[x])\n\t"
|
||||
"vpkg %%v17,%%v16,%%v2\n\t"
|
||||
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
||||
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||
"vl %%v2,48(%%r1,%[x])\n\t"
|
||||
"vpkg %%v19,%%v18,%%v2\n\t"
|
||||
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
||||
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||
"vl %%v2,80(%%r1,%[x])\n\t"
|
||||
"vpkg %%v21,%%v20,%%v2\n\t"
|
||||
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
||||
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||
"vl %%v2,112(%%r1,%[x])\n\t"
|
||||
"vpkg %%v23,%%v22,%%v2\n\t"
|
||||
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
||||
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||
"vl %%v2,144(%%r1,%[x])\n\t"
|
||||
"vpkg %%v25,%%v24,%%v2\n\t"
|
||||
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
||||
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||
"vl %%v2,176(%%r1,%[x])\n\t"
|
||||
"vpkg %%v27,%%v26,%%v2\n\t"
|
||||
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
||||
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||
"vl %%v2,208(%%r1,%[x])\n\t"
|
||||
"vpkg %%v29,%%v28,%%v2\n\t"
|
||||
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
||||
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||
"vl %%v2,240(%%r1,%[x])\n\t"
|
||||
"vpkg %%v31,%%v30,%%v2\n\t"
|
||||
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
||||
"vflpsb %%v16,%%v16\n\t"
|
||||
"vflpsb %%v17,%%v17\n\t"
|
||||
"vflpsb %%v18,%%v18\n\t"
|
||||
"vflpsb %%v19,%%v19\n\t"
|
||||
"vflpsb %%v20,%%v20\n\t"
|
||||
"vflpsb %%v21,%%v21\n\t"
|
||||
"vflpsb %%v22,%%v22\n\t"
|
||||
"vflpsb %%v23,%%v23\n\t"
|
||||
"vflpsb %%v24,%%v24\n\t"
|
||||
"vflpsb %%v25,%%v25\n\t"
|
||||
"vflpsb %%v26,%%v26\n\t"
|
||||
"vflpsb %%v27,%%v27\n\t"
|
||||
"vflpsb %%v28,%%v28\n\t"
|
||||
"vflpsb %%v29,%%v29\n\t"
|
||||
"vflpsb %%v30,%%v30\n\t"
|
||||
"vflpsb %%v31,%%v31\n\t"
|
||||
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||
"vfasb %%v18,%%v18,%%v19\n\t"
|
||||
"vfasb %%v20,%%v20,%%v21\n\t"
|
||||
"vfasb %%v22,%%v22,%%v23\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vfasb %%v26,%%v26,%%v27\n\t"
|
||||
"vfasb %%v28,%%v28,%%v29\n\t"
|
||||
"vfasb %%v30,%%v30,%%v31\n\t"
|
||||
"vfminsb %%v16,%%v16,%%v24,0\n\t"
|
||||
"vfminsb %%v18,%%v18,%%v26,0\n\t"
|
||||
"vfminsb %%v20,%%v20,%%v28,0\n\t"
|
||||
"vfminsb %%v22,%%v22,%%v30,0\n\t"
|
||||
"vfminsb %%v16,%%v16,%%v20,0\n\t"
|
||||
"vfminsb %%v18,%%v18,%%v22,0\n\t"
|
||||
"vfminsb %%v16,%%v16,%%v18,0\n\t"
|
||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"agfi %%r1, 256\n\t"
|
||||
"brctg %[n], 0b\n\t"
|
||||
"veslg %%v16,%%v0,32\n\t"
|
||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"vrepf %%v16,%%v0,2\n\t"
|
||||
"wfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||
"ler %[amin],%%f0"
|
||||
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31");
|
||||
|
||||
return amin;
|
||||
}
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT minf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0)
|
||||
return (minf);
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
minf = camin_kernel_32(n1, x);
|
||||
ix = n1 * 2;
|
||||
i = n1;
|
||||
} else {
|
||||
minf = CABS1(x, 0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (CABS1(x, ix) < minf) {
|
||||
minf = CABS1(x, ix);
|
||||
}
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
return (minf);
|
||||
|
||||
} else {
|
||||
|
||||
minf = CABS1(x, 0);
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
while (i < n1) {
|
||||
|
||||
if (CABS1(x, ix) < minf) {
|
||||
minf = CABS1(x, ix);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2) < minf) {
|
||||
minf = CABS1(x, ix + inc_x2);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2 * 2) < minf) {
|
||||
minf = CABS1(x, ix + inc_x2 * 2);
|
||||
}
|
||||
if (CABS1(x, ix + inc_x2 * 3) < minf) {
|
||||
minf = CABS1(x, ix + inc_x2 * 3);
|
||||
}
|
||||
|
||||
ix += inc_x2 * 4;
|
||||
|
||||
i += 4;
|
||||
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (CABS1(x, ix) < minf) {
|
||||
minf = CABS1(x, ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
return (minf);
|
||||
}
|
||||
}
|
||||
155
kernel/zarch/casum.c
Normal file
155
kernel/zarch/casum.c
Normal file
@@ -0,0 +1,155 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT asum;
|
||||
|
||||
__asm__("vzero %%v24\n\t"
|
||||
"vzero %%v25\n\t"
|
||||
"vzero %%v26\n\t"
|
||||
"vzero %%v27\n\t"
|
||||
"vzero %%v28\n\t"
|
||||
"vzero %%v29\n\t"
|
||||
"vzero %%v30\n\t"
|
||||
"vzero %%v31\n\t"
|
||||
"srlg %[n],%[n],5\n\t"
|
||||
"xgr %%r1,%%r1\n\t"
|
||||
"0:\n\t"
|
||||
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||
"vflpsb %%v16, %%v16\n\t"
|
||||
"vflpsb %%v17, %%v17\n\t"
|
||||
"vflpsb %%v18, %%v18\n\t"
|
||||
"vflpsb %%v19, %%v19\n\t"
|
||||
"vflpsb %%v20, %%v20\n\t"
|
||||
"vflpsb %%v21, %%v21\n\t"
|
||||
"vflpsb %%v22, %%v22\n\t"
|
||||
"vflpsb %%v23, %%v23\n\t"
|
||||
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||
"vl %%v16, 128(%%r1,%[x])\n\t"
|
||||
"vl %%v17, 144(%%r1,%[x])\n\t"
|
||||
"vl %%v18, 160(%%r1,%[x])\n\t"
|
||||
"vl %%v19, 176(%%r1,%[x])\n\t"
|
||||
"vl %%v20, 192(%%r1,%[x])\n\t"
|
||||
"vl %%v21, 208(%%r1,%[x])\n\t"
|
||||
"vl %%v22, 224(%%r1,%[x])\n\t"
|
||||
"vl %%v23, 240(%%r1,%[x])\n\t"
|
||||
"vflpsb %%v16, %%v16\n\t"
|
||||
"vflpsb %%v17, %%v17\n\t"
|
||||
"vflpsb %%v18, %%v18\n\t"
|
||||
"vflpsb %%v19, %%v19\n\t"
|
||||
"vflpsb %%v20, %%v20\n\t"
|
||||
"vflpsb %%v21, %%v21\n\t"
|
||||
"vflpsb %%v22, %%v22\n\t"
|
||||
"vflpsb %%v23, %%v23\n\t"
|
||||
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||
"agfi %%r1,256\n\t"
|
||||
"brctg %[n],0b\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vfasb %%v24,%%v24,%%v26\n\t"
|
||||
"vfasb %%v24,%%v24,%%v27\n\t"
|
||||
"vfasb %%v24,%%v24,%%v28\n\t"
|
||||
"vfasb %%v24,%%v24,%%v29\n\t"
|
||||
"vfasb %%v24,%%v24,%%v30\n\t"
|
||||
"vfasb %%v24,%%v24,%%v31\n\t"
|
||||
"veslg %%v25,%%v24,32\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vrepf %%v25,%%v24,2\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vstef %%v24,%[asum],0"
|
||||
: [asum] "=Q"(asum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||
|
||||
return asum;
|
||||
}
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ip = 0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG n1;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0)
|
||||
return (sumf);
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
sumf = casum_kernel_32(n1, x);
|
||||
i = n1;
|
||||
ip = 2 * n1;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||
i++;
|
||||
ip += 2;
|
||||
}
|
||||
|
||||
} else {
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
while (i < n) {
|
||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||
ip += inc_x2;
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
return (sumf);
|
||||
}
|
||||
166
kernel/zarch/caxpy.c
Normal file
166
kernel/zarch/caxpy.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||
__asm__(
|
||||
#if !defined(CONJ)
|
||||
"vlrepf %%v0,0(%[alpha])\n\t"
|
||||
"vlef %%v1,4(%[alpha]),0\n\t"
|
||||
"vlef %%v1,4(%[alpha]),2\n\t"
|
||||
"vflcsb %%v1,%%v1\n\t"
|
||||
"vlef %%v1,4(%[alpha]),1\n\t"
|
||||
"vlef %%v1,4(%[alpha]),3\n\t"
|
||||
#else
|
||||
"vlef %%v0,0(%[alpha]),1\n\t"
|
||||
"vlef %%v0,0(%[alpha]),3\n\t"
|
||||
"vflcsb %%v0,%%v0\n\t"
|
||||
"vlef %%v0,0(%[alpha]),0\n\t"
|
||||
"vlef %%v0,0(%[alpha]),2\n\t"
|
||||
"vlrepf %%v1,4(%[alpha])\n\t"
|
||||
#endif
|
||||
"srlg %[n],%[n],4\n\t"
|
||||
"xgr %%r1,%%r1\n\t"
|
||||
"0:\n\t"
|
||||
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||
"vl %%v8,0(%%r1,%[x])\n\t"
|
||||
"vl %%v9,16(%%r1,%[x])\n\t"
|
||||
"vl %%v10,32(%%r1,%[x])\n\t"
|
||||
"vl %%v11,48(%%r1,%[x])\n\t"
|
||||
"vl %%v12,0(%%r1,%[y])\n\t"
|
||||
"vl %%v13,16(%%r1,%[y])\n\t"
|
||||
"vl %%v14,32(%%r1,%[y])\n\t"
|
||||
"vl %%v15,48(%%r1,%[y])\n\t"
|
||||
"vl %%v16,64(%%r1,%[x])\n\t"
|
||||
"vl %%v17,80(%%r1,%[x])\n\t"
|
||||
"vl %%v18,96(%%r1,%[x])\n\t"
|
||||
"vl %%v19,112(%%r1,%[x])\n\t"
|
||||
"vl %%v20,64(%%r1,%[y])\n\t"
|
||||
"vl %%v21,80(%%r1,%[y])\n\t"
|
||||
"vl %%v22,96(%%r1,%[y])\n\t"
|
||||
"vl %%v23,112(%%r1,%[y])\n\t"
|
||||
"verllg %%v24,%%v8,32\n\t"
|
||||
"verllg %%v25,%%v9,32\n\t"
|
||||
"verllg %%v26,%%v10,32\n\t"
|
||||
"verllg %%v27,%%v11,32\n\t"
|
||||
"verllg %%v28,%%v16,32\n\t"
|
||||
"verllg %%v29,%%v17,32\n\t"
|
||||
"verllg %%v30,%%v18,32\n\t"
|
||||
"verllg %%v31,%%v19,32\n\t"
|
||||
"vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
|
||||
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
|
||||
"vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
|
||||
"vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
|
||||
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
|
||||
"vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
|
||||
"vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
|
||||
"vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
|
||||
"vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
|
||||
"vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
|
||||
"vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
|
||||
"vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
|
||||
"vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
|
||||
"vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
|
||||
"vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
|
||||
"vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
|
||||
"vst %%v8,0(%%r1,%[y])\n\t"
|
||||
"vst %%v9,16(%%r1,%[y])\n\t"
|
||||
"vst %%v10,32(%%r1,%[y])\n\t"
|
||||
"vst %%v11,48(%%r1,%[y])\n\t"
|
||||
"vst %%v16,64(%%r1,%[y])\n\t"
|
||||
"vst %%v17,80(%%r1,%[y])\n\t"
|
||||
"vst %%v18,96(%%r1,%[y])\n\t"
|
||||
"vst %%v19,112(%%r1,%[y])\n\t"
|
||||
"agfi %%r1,128\n\t"
|
||||
"brctg %[n],0b"
|
||||
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
|
||||
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT da[2] __attribute__ ((aligned(16)));
|
||||
|
||||
if (n <= 0)
|
||||
return (0);
|
||||
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if (n1) {
|
||||
da[0] = da_r;
|
||||
da[1] = da_i;
|
||||
caxpy_kernel_16(n1, x, y, da);
|
||||
ix = 2 * n1;
|
||||
}
|
||||
i = n1;
|
||||
while (i < n) {
|
||||
#if !defined(CONJ)
|
||||
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
|
||||
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
|
||||
#else
|
||||
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
|
||||
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
|
||||
#endif
|
||||
i++;
|
||||
ix += 2;
|
||||
|
||||
}
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
inc_x *= 2;
|
||||
inc_y *= 2;
|
||||
|
||||
while (i < n) {
|
||||
|
||||
#if !defined(CONJ)
|
||||
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
|
||||
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
|
||||
#else
|
||||
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
|
||||
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
|
||||
#endif
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
i++;
|
||||
|
||||
}
|
||||
return (0);
|
||||
|
||||
}
|
||||
88
kernel/zarch/ccopy.c
Normal file
88
kernel/zarch/ccopy.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Copy n complex single-precision elements from x to y using the z/Arch
 * MVC (move character) instruction, 256 bytes (32 complex floats) per
 * iteration.  n must be a positive multiple of 32 (caller masks with
 * n & -32).  Both vectors must be contiguous (unit increment). */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],5\n\t"        /* loop count = n / 32 */
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"       /* prefetch source ahead */
          "pfd 2, 1024(%[y])\n\t"       /* prefetch destination for store */
          "mvc 0(256,%[y]),0(%[x])\n\t" /* move 256 bytes x -> y */
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"
          : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
          [n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
          : "cc");
}
|
||||
|
||||
/* CCOPY: copy the complex single-precision vector x into y.
 *
 * n      number of complex elements to copy (<= 0 is a no-op)
 * x      source vector; inc_x is its stride in complex elements
 * y      destination vector; inc_y is its stride in complex elements
 * return always 0 (BLAS copy has no failure mode)
 *
 * Contiguous case (inc_x == inc_y == 1) bulk-copies the largest multiple
 * of 32 elements with ccopy_kernel_32 and finishes with a scalar tail. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
  BLASLONG i = 0;
  BLASLONG ix = 0, iy = 0;

  if (n <= 0)
    return (0);

  if ((inc_x == 1) && (inc_y == 1)) {

    BLASLONG n1 = n & -32; /* multiple of 32 handled by the MVC kernel */
    if (n1 > 0) {
      ccopy_kernel_32(n1, x, y);
      i = n1;
      ix = n1 * 2; /* indices count FLOATs: 2 per complex element */
      iy = n1 * 2;
    }

    while (i < n) {
      /* BUGFIX: the real part previously read x[iy] instead of x[ix].
         Harmless in this branch (ix == iy when both increments are 1),
         but wrong by construction and a trap for future reuse. */
      y[iy] = x[ix];
      y[iy + 1] = x[ix + 1];
      ix += 2;
      iy += 2;
      i++;
    }

  } else {

    /* Strided path: increments are in complex elements, so double them
       to step over (real, imag) FLOAT pairs. */
    BLASLONG inc_x2 = 2 * inc_x;
    BLASLONG inc_y2 = 2 * inc_y;

    while (i < n) {
      y[iy] = x[ix];
      y[iy + 1] = x[ix + 1];
      ix += inc_x2;
      iy += inc_y2;
      i++;
    }
  }

  return (0);
}
|
||||
176
kernel/zarch/cdot.c
Normal file
176
kernel/zarch/cdot.c
Normal file
@@ -0,0 +1,176 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Vectorized partial complex dot product over n elements (n must be a
 * positive multiple of 16).  Accumulates four partial sums into d:
 *   d[0] = sum(x.re * y.re)   d[1] = sum(x.im * y.im)   (via v24/v25 halves)
 *   d[2] = sum(x.re * y.im)   d[3] = sum(x.im * y.re)
 * so the caller can combine them for either the conjugated or the
 * unconjugated dot product.  verllg rotates each 64-bit lane by 32 bits
 * to swap the (re, im) halves of each complex number; eight accumulators
 * hide the FMA latency.  NOTE(review): lane-to-d[] mapping inferred from
 * the final vstef stores — confirm against the scalar tail in CNAME. */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],4\n\t"   /* loop count = n / 16 */
          "xgr %%r1,%%r1\n\t"      /* r1 = running byte offset */
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "pfd 1, 1024(%%r1,%[y])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v0, 0(%%r1,%[y])\n\t"
          "vl %%v1, 16(%%r1,%[y])\n\t"
          "vl %%v2, 32(%%r1,%[y])\n\t"
          "vl %%v3, 48(%%r1,%[y])\n\t"
          /* swap re/im within each complex pair */
          "verllg %%v20,%%v16,32\n\t"
          "verllg %%v21,%%v17,32\n\t"
          "verllg %%v22,%%v18,32\n\t"
          "verllg %%v23,%%v19,32\n\t"
          "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
          "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
          "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
          "vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
          "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
          "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
          "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
          "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
          /* second half of the 16-element chunk */
          "vl %%v16, 64(%%r1,%[x])\n\t"
          "vl %%v17, 80(%%r1,%[x])\n\t"
          "vl %%v18, 96(%%r1,%[x])\n\t"
          "vl %%v19, 112(%%r1,%[x])\n\t"
          "vl %%v0, 64(%%r1,%[y])\n\t"
          "vl %%v1, 80(%%r1,%[y])\n\t"
          "vl %%v2, 96(%%r1,%[y])\n\t"
          "vl %%v3, 112(%%r1,%[y])\n\t"
          "verllg %%v20,%%v16,32\n\t"
          "verllg %%v21,%%v17,32\n\t"
          "verllg %%v22,%%v18,32\n\t"
          "verllg %%v23,%%v19,32\n\t"
          "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
          "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
          "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
          "vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
          "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
          "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
          "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
          "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
          "agfi %%r1,128\n\t"      /* 16 complex floats = 128 bytes */
          "brctg %[n],0b\n\t"
          /* horizontal reduction of the eight accumulators */
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vrepg %%v26,%%v24,1\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v25,%%v25,%%v27\n\t"
          "vfasb %%v25,%%v25,%%v29\n\t"
          "vfasb %%v25,%%v25,%%v31\n\t"
          "vrepg %%v27,%%v25,1\n\t"
          "vfasb %%v25,%%v25,%%v27\n\t"
          "vstef %%v24,0(%[d]),0\n\t"
          "vstef %%v24,4(%[d]),1\n\t"
          "vstef %%v25,8(%[d]),1\n\t"
          "vstef %%v25,12(%[d]),0"
          : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
          : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
          "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
          "v31");
}
|
||||
|
||||
/* CDOT: complex single-precision dot product of x and y.
 *
 * Accumulates four partial sums:
 *   dot[0] = sum(x.re * y.re)   dot[1] = sum(x.im * y.im)
 *   dot[2] = sum(x.re * y.im)   dot[3] = sum(x.im * y.re)
 * and combines them at the end: without CONJ this yields x . y
 * (cdotu); with CONJ it yields conj(x) . y (cdotc).
 * Returns 0+0i for n <= 0.  Contiguous inputs use the vector kernel
 * for the largest multiple of 16 elements, then a scalar tail. */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
                             BLASLONG inc_y) {
  OPENBLAS_COMPLEX_FLOAT result;
  FLOAT dot[4] __attribute__ ((aligned(16))) = {
  0.0, 0.0, 0.0, 0.0};

  if (n <= 0) {
    CREAL(result) = 0.0;
    CIMAG(result) = 0.0;
    return (result);
  }

  if ((inc_x == 1) && (inc_y == 1)) {

    BLASLONG vec_len = n & -16;  /* chunk handled by the SIMD kernel */

    if (vec_len)
      cdot_kernel_16(vec_len, x, y, dot);

    /* scalar tail; idx counts FLOATs (2 per complex element) */
    for (BLASLONG k = vec_len, idx = vec_len * 2; k < n; k++, idx += 2) {
      dot[0] += x[idx] * y[idx];
      dot[1] += x[idx + 1] * y[idx + 1];
      dot[2] += x[idx] * y[idx + 1];
      dot[3] += x[idx + 1] * y[idx];
    }

  } else {

    /* strided path: convert complex-element strides to FLOAT strides */
    BLASLONG step_x = inc_x << 1;
    BLASLONG step_y = inc_y << 1;
    BLASLONG xi = 0;
    BLASLONG yi = 0;

    for (BLASLONG k = 0; k < n; k++) {
      dot[0] += x[xi] * y[yi];
      dot[1] += x[xi + 1] * y[yi + 1];
      dot[2] += x[xi] * y[yi + 1];
      dot[3] += x[xi + 1] * y[yi];
      xi += step_x;
      yi += step_y;
    }
  }

#if !defined(CONJ)
  CREAL(result) = dot[0] - dot[1];
  CIMAG(result) = dot[2] + dot[3];
#else
  CREAL(result) = dot[0] + dot[1];
  CIMAG(result) = dot[2] - dot[3];
#endif

  return (result);
}
|
||||
752
kernel/zarch/cgemv_n_4.c
Normal file
752
kernel/zarch/cgemv_n_4.c
Normal file
@@ -0,0 +1,752 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/* GEMV(N) micro-kernel: y += A[:,0..3] * x for four complex columns at once.
 * n is the number of rows (caller guarantees evenness via srlg n,1 loop);
 * ap holds the four column pointers, x the four complex scalars, y the
 * contiguous accumulator.  v16..v19 broadcast the scalar (re,im) pairs;
 * v20..v23 hold the same scalars with re/im swapped and one half negated
 * (vflcsb), with the sign placement chosen by the CONJ/XCONJ combination.
 * v1/v2 are vperm patterns built with vleib: v1 duplicates the real parts
 * of each complex lane, v2 duplicates the imaginary parts. */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  __asm__("vlrepg %%v16,0(%[x])\n\t"
          "vlrepg %%v17,8(%[x])\n\t"
          "vlrepg %%v18,16(%[x])\n\t"
          "vlrepg %%v19,24(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* unconjugated form: v2x = (-im, re) per lane */
          "vlef %%v20,4(%[x]),0\n\t"
          "vlef %%v20,4(%[x]),2\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,0(%[x]),1\n\t"
          "vlef %%v20,0(%[x]),3\n\t"
          "vlef %%v21,12(%[x]),0\n\t"
          "vlef %%v21,12(%[x]),2\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,8(%[x]),1\n\t"
          "vlef %%v21,8(%[x]),3\n\t"
          "vlef %%v22,20(%[x]),0\n\t"
          "vlef %%v22,20(%[x]),2\n\t"
          "vflcsb %%v22,%%v22\n\t"
          "vlef %%v22,16(%[x]),1\n\t"
          "vlef %%v22,16(%[x]),3\n\t"
          "vlef %%v23,28(%[x]),0\n\t"
          "vlef %%v23,28(%[x]),2\n\t"
          "vflcsb %%v23,%%v23\n\t"
          "vlef %%v23,24(%[x]),1\n\t"
          "vlef %%v23,24(%[x]),3\n\t"
#else
          /* conjugated form: v2x = (im, -re) per lane */
          "vlef %%v20,0(%[x]),1\n\t"
          "vlef %%v20,0(%[x]),3\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,4(%[x]),0\n\t"
          "vlef %%v20,4(%[x]),2\n\t"
          "vlef %%v21,8(%[x]),1\n\t"
          "vlef %%v21,8(%[x]),3\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,12(%[x]),0\n\t"
          "vlef %%v21,12(%[x]),2\n\t"
          "vlef %%v22,16(%[x]),1\n\t"
          "vlef %%v22,16(%[x]),3\n\t"
          "vflcsb %%v22,%%v22\n\t"
          "vlef %%v22,20(%[x]),0\n\t"
          "vlef %%v22,20(%[x]),2\n\t"
          "vlef %%v23,24(%[x]),1\n\t"
          "vlef %%v23,24(%[x]),3\n\t"
          "vflcsb %%v23,%%v23\n\t"
          "vlef %%v23,28(%[x]),0\n\t"
          "vlef %%v23,28(%[x]),2\n\t"
#endif
          /* v1: permute pattern duplicating each lane's real part */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          /* v2: permute pattern duplicating each lane's imaginary part */
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"  /* two complex rows per iteration */
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v24,0(%%r1,%[ap0])\n\t"
          "vperm %%v25,%%v24,%%v24,%%v2\n\t"
          "vperm %%v24,%%v24,%%v24,%%v1\n\t"
          "vl %%v26,0(%%r1,%[ap1])\n\t"
          "vperm %%v27,%%v26,%%v26,%%v2\n\t"
          "vperm %%v26,%%v26,%%v26,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
          "vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
          "vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
          "vl %%v28,0(%%r1,%[ap2])\n\t"
          "vperm %%v29,%%v28,%%v28,%%v2\n\t"
          "vperm %%v28,%%v28,%%v28,%%v1\n\t"
          "vl %%v30,0(%%r1,%[ap3])\n\t"
          "vperm %%v31,%%v30,%%v30,%%v2\n\t"
          "vperm %%v30,%%v30,%%v30,%%v1\n\t"
          "vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
          "vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
          "vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
          "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
          "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
          "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
          "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
          "v31");
}
|
||||
|
||||
/* GEMV(N) micro-kernel: y += A[:,0..1] * x for two complex columns.
 * Same scheme as cgemv_kernel_4x4: v16/v17 broadcast the two complex
 * scalars, v18/v19 carry their re/im-swapped, sign-adjusted (vflcsb)
 * counterparts, and vperm patterns v1 (real parts) / v2 (imag parts)
 * split each column load before the FMAs. */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__("vlrepg %%v16,0(%[x])\n\t"
          "vlrepg %%v17,8(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v18,4(%[x]),0\n\t"
          "vlef %%v18,4(%[x]),2\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[x]),1\n\t"
          "vlef %%v18,0(%[x]),3\n\t"
          "vlef %%v19,12(%[x]),0\n\t"
          "vlef %%v19,12(%[x]),2\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,8(%[x]),1\n\t"
          "vlef %%v19,8(%[x]),3\n\t"
#else
          "vlef %%v18,0(%[x]),1\n\t"
          "vlef %%v18,0(%[x]),3\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,4(%[x]),0\n\t"
          "vlef %%v18,4(%[x]),2\n\t"
          "vlef %%v19,8(%[x]),1\n\t"
          "vlef %%v19,8(%[x]),3\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,12(%[x]),0\n\t"
          "vlef %%v19,12(%[x]),2\n\t"
#endif
          /* v1/v2: vperm patterns selecting real / imaginary parts */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"  /* two complex rows per iteration */
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v20,0(%%r1,%[ap0])\n\t"
          "vperm %%v21,%%v20,%%v20,%%v2\n\t"
          "vperm %%v20,%%v20,%%v20,%%v1\n\t"
          "vl %%v22,0(%%r1,%[ap1])\n\t"
          "vperm %%v23,%%v22,%%v22,%%v2\n\t"
          "vperm %%v22,%%v22,%%v22,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
          "vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
          "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
          "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23");
}
|
||||
|
||||
/* GEMV(N) micro-kernel: y += A[:,0] * x for a single complex column.
 * v16 broadcasts the complex scalar x[0]; v17 holds its re/im-swapped,
 * sign-adjusted (vflcsb) counterpart per the CONJ/XCONJ combination.
 * vperm patterns v1 (real parts) / v2 (imag parts) split each column
 * load before the two FMAs. */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
  __asm__("vlrepg %%v16,0(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v17,4(%[x]),0\n\t"
          "vlef %%v17,4(%[x]),2\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef %%v17,0(%[x]),1\n\t"
          "vlef %%v17,0(%[x]),3\n\t"
#else
          "vlef %%v17,0(%[x]),1\n\t"
          "vlef %%v17,0(%[x]),3\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef %%v17,4(%[x]),0\n\t"
          "vlef %%v17,4(%[x]),2\n\t"
#endif
          /* v1/v2: vperm patterns selecting real / imaginary parts */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"  /* two complex rows per iteration */
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v2\n\t"
          "vperm %%v18,%%v18,%%v18,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
          "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
}
|
||||
|
||||
/* Vectorized y += alpha * src for a contiguous complex destination.
 * n must be a multiple of 4 complex elements (caller guarantees the
 * NB blocking makes this hold — NOTE(review): confirm NBMAX and the m2
 * tail are always multiples of 4).  v0/v1 hold the broadcast alpha with
 * the sign pattern flipped (vflcsb) on the appropriate half depending on
 * XCONJ; verllg swaps re/im within each lane before the second FMA. */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
                    FLOAT alpha_i) {
  __asm__(
#if !defined(XCONJ)
          "vlrepf %%v0,%[alpha_r]\n\t"
          "vlef %%v1,%[alpha_i],0\n\t"
          "vlef %%v1,%[alpha_i],2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,%[alpha_i],1\n\t"
          "vlef %%v1,%[alpha_i],3\n\t"
#else
          "vlef %%v0,%[alpha_r],1\n\t"
          "vlef %%v0,%[alpha_r],3\n\t"
          "vflcsb %%v0,%%v0\n\t"
          "vlef %%v0,%[alpha_r],0\n\t"
          "vlef %%v0,%[alpha_r],2\n\t"
          "vlrepf %%v1,%[alpha_i]\n\t"
#endif
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],2\n\t"  /* 4 complex elements per iteration */
          "0:\n\t"
          "pfd 1,1024(%%r1,%[src])\n\t"
          "pfd 2,1024(%%r1,%[dest])\n\t"
          "vl %%v16,0(%%r1,%[src])\n\t"
          "vl %%v17,16(%%r1,%[src])\n\t"
          "vl %%v18,0(%%r1,%[dest])\n\t"
          "vl %%v19,16(%%r1,%[dest])\n\t"
          "verllg %%v20,%%v16,32\n\t"  /* swap re/im in each lane */
          "verllg %%v21,%%v17,32\n\t"
          "vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
          "vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
          "vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
          "vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
          "vst %%v22,0(%%r1,%[dest])\n\t"
          "vst %%v23,16(%%r1,%[dest])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
          : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
          [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
          "v22", "v23");
}
|
||||
|
||||
/* Add the alpha-scaled accumulator buffer into the destination vector:
 *   dest += alpha * src   (complex; XCONJ negates alpha's imaginary
 *   contribution per the BLAS conjugation convention).
 * inc_dest is the destination stride in FLOATs (2 == contiguous complex),
 * in which case the vectorized helper add_y_4 is used. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
                  FLOAT alpha_r, FLOAT alpha_i) {
  /* Contiguous destination: hand off to the SIMD helper. */
  if (inc_dest == 2) {
    add_y_4(n, src, dest, alpha_r, alpha_i);
    return;
  }

  /* Strided destination: scalar complex multiply-accumulate. */
  for (BLASLONG k = 0; k < n; k++) {
    FLOAT prod_re;
    FLOAT prod_im;
#if !defined(XCONJ)
    prod_re = alpha_r * src[0] - alpha_i * src[1];
    prod_im = alpha_r * src[1] + alpha_i * src[0];
#else
    prod_re = alpha_r * src[0] + alpha_i * src[1];
    prod_im = -alpha_r * src[1] + alpha_i * src[0];
#endif

    dest[0] += prod_re;
    dest[1] += prod_im;

    src += 2;
    dest += inc_dest;
  }
}
|
||||
|
||||
/* CGEMV(N): y += alpha * A * x for a complex single-precision matrix.
 *
 * m, n     matrix dimensions (rows, columns); returns 0 immediately if
 *          either is < 1
 * dummy1   unused (kept for the standard OpenBLAS gemv kernel signature)
 * alpha_*  complex scaling factor applied when accumulating into y
 * a, lda   column-major matrix and its leading dimension (in complex
 *          elements on entry; doubled below to count FLOATs)
 * x, inc_x input vector and stride (complex elements on entry)
 * y, inc_y output vector and stride (complex elements on entry)
 * buffer   caller-provided workspace used as the per-block accumulator
 *
 * Strategy: process rows in blocks of NBMAX, accumulating each block
 * into a zeroed buffer via the 4x4/4x2/4x1 column kernels, then merge
 * with add_y.  The final m%4 rows are handled by scalar dot-product
 * tails specialized for m3 == 1, 2, 3. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer) {
  BLASLONG i;
  FLOAT *a_ptr;
  FLOAT *x_ptr;
  FLOAT *y_ptr;
  FLOAT *ap[4];     /* pointers to four consecutive matrix columns */
  BLASLONG n1;      /* number of 4-column groups */
  BLASLONG m1;      /* rows handled in full NBMAX blocks */
  BLASLONG m2;      /* rows in the final partial block (multiple of 4) */
  BLASLONG m3;      /* leftover rows (m % 4), handled by scalar tails */
  BLASLONG n2;      /* leftover columns (n % 4) */
  BLASLONG lda4;
  FLOAT xbuffer[8], *ybuffer;

  if (m < 1)
    return (0);
  if (n < 1)
    return (0);

  ybuffer = buffer;

  /* convert all strides from complex elements to FLOAT counts */
  inc_x *= 2;
  inc_y *= 2;
  lda *= 2;
  lda4 = 4 * lda;

  n1 = n / 4;
  n2 = n % 4;

  m3 = m % 4;
  m1 = m - (m % 4);
  m2 = (m % NBMAX) - (m % 4);

  y_ptr = y;

  BLASLONG NB = NBMAX;

  while (NB == NBMAX) {

    m1 -= NB;
    if (m1 < 0) {
      if (m2 == 0)
        break;
      NB = m2;  /* last (partial) block; loop exits after this pass */
    }

    a_ptr = a;
    ap[0] = a_ptr;
    ap[1] = a_ptr + lda;
    ap[2] = ap[1] + lda;
    ap[3] = ap[2] + lda;
    x_ptr = x;
    //zero_y(NB,ybuffer);
    /* clear the accumulator: NB complex floats = NB * 8 bytes */
    memset(ybuffer, 0, NB * 8);

    if (inc_x == 2) {
      /* contiguous x: feed the kernels directly */

      for (i = 0; i < n1; i++) {
        cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
        x_ptr += 8;
      }

      if (n2 & 2) {
        cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
        x_ptr += 4;
        a_ptr += 2 * lda;

      }

      if (n2 & 1) {
        cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
        /* x_ptr += 2;
           a_ptr += lda; */

      }
    } else {
      /* strided x: gather the needed elements into xbuffer first */

      for (i = 0; i < n1; i++) {

        xbuffer[0] = x_ptr[0];
        xbuffer[1] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[2] = x_ptr[0];
        xbuffer[3] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[4] = x_ptr[0];
        xbuffer[5] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[6] = x_ptr[0];
        xbuffer[7] = x_ptr[1];
        x_ptr += inc_x;

        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
      }

      for (i = 0; i < n2; i++) {
        xbuffer[0] = x_ptr[0];
        xbuffer[1] = x_ptr[1];
        x_ptr += inc_x;
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
        a_ptr += 1 * lda;

      }

    }

    /* merge the block accumulator into y, scaled by alpha */
    add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
    a += 2 * NB;          /* advance A by NB rows (FLOAT units) */
    y_ptr += NB * inc_y;
  }

  if (m3 == 0)
    return (0);

  /* --- scalar tails for the remaining m % 4 rows --- */

  if (m3 == 1) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r = 0.0;
    FLOAT temp_i = 0.0;

    if (lda == 2 && inc_x == 2) {
      /* fully contiguous: unroll by two columns */

      for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
        temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
        temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif

        a_ptr += 4;
        x_ptr += 4;
      }

      for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

        a_ptr += 2;
        x_ptr += 2;
      }

    } else {

      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

        a_ptr += lda;
        x_ptr += inc_x;
      }

    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
    y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
    y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
    y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
    return (0);
  }

  if (m3 == 2) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r0 = 0.0;
    FLOAT temp_i0 = 0.0;
    FLOAT temp_r1 = 0.0;
    FLOAT temp_i1 = 0.0;

    if (lda == 4 && inc_x == 2) {
      /* fully contiguous: unroll by two columns */

      for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];

        temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
        temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
        temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
        temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];

#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];

        temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
        temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
        temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
        temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];

#endif

        a_ptr += 8;
        x_ptr += 4;
      }

      for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

        a_ptr += 4;
        x_ptr += 2;
      }

    } else {

      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

        a_ptr += lda;
        x_ptr += inc_x;
      }

    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
    return (0);
  }

  if (m3 == 3) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r0 = 0.0;
    FLOAT temp_i0 = 0.0;
    FLOAT temp_r1 = 0.0;
    FLOAT temp_i1 = 0.0;
    FLOAT temp_r2 = 0.0;
    FLOAT temp_i2 = 0.0;

    if (lda == 6 && inc_x == 2) {

      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

        a_ptr += 6;
        x_ptr += 2;
      }

    } else {

      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

        a_ptr += lda;
        x_ptr += inc_x;
      }

    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
    y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
    y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
    return (0);
  }

  return (0);
}
|
||||
724
kernel/zarch/cgemv_t_4.c
Normal file
724
kernel/zarch/cgemv_t_4.c
Normal file
@@ -0,0 +1,724 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/* Complex single-precision GEMV(transpose) micro-kernel: accumulates the
 * dot products of four matrix columns ap[0..3] with the vector x and adds
 * alpha-scaled results into y[0..7] (4 complex values).
 * n is the number of complex elements per column; the asm halves it
 * ("srlg %[n],%[n],1") and processes 2 complex elements per iteration,
 * so callers must pass n that is a multiple of 4 (driver uses NB blocks).
 * NOTE(review): z/Architecture vector-facility code; v2/v3 appear to be
 * VPERM byte patterns that duplicate real/imaginary parts — confirm against
 * the z14 vector instruction reference before touching. */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  __asm__(/* clear the 8 accumulators */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          "vzero %%v20\n\t"
          "vzero %%v21\n\t"
          "vzero %%v22\n\t"
          "vzero %%v23\n\t"
          /* build byte-permute masks v2/v3 (element duplication patterns) */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          /* n /= 2: two complex elements per loop trip */
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
          /* v1 = sign-adjusted companion of x depending on CONJ/XCONJ */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          /* load and de-interleave the four column slices */
          "vl %%v24,0(%%r1,%[ap0])\n\t"
          "vperm %%v25,%%v24,%%v24,%%v3\n\t"
          "vperm %%v24,%%v24,%%v24,%%v2\n\t"
          "vl %%v26,0(%%r1,%[ap1])\n\t"
          "vperm %%v27,%%v26,%%v26,%%v3\n\t"
          "vperm %%v26,%%v26,%%v26,%%v2\n\t"
          "vl %%v28,0(%%r1,%[ap2])\n\t"
          "vperm %%v29,%%v28,%%v28,%%v3\n\t"
          "vperm %%v28,%%v28,%%v28,%%v2\n\t"
          "vl %%v30,0(%%r1,%[ap3])\n\t"
          "vperm %%v31,%%v30,%%v30,%%v3\n\t"
          "vperm %%v30,%%v30,%%v30,%%v2\n\t"
          /* fused multiply-add into the per-column accumulators */
          "vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
          "vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
          "vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
          "vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
          "vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
          "vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
          "vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* horizontal reduction of the accumulators */
          "vfasb %%v16,%%v16,%%v20\n\t"
          "vfasb %%v17,%%v17,%%v21\n\t"
          "vfasb %%v18,%%v18,%%v22\n\t"
          "vfasb %%v19,%%v19,%%v23\n\t"
          "vrepg %%v20,%%v16,1\n\t"
          "vrepg %%v21,%%v17,1\n\t"
          "vrepg %%v22,%%v18,1\n\t"
          "vrepg %%v23,%%v19,1\n\t"
          "vfasb %%v16,%%v16,%%v20\n\t"
          "vfasb %%v17,%%v17,%%v21\n\t"
          "vfasb %%v18,%%v18,%%v22\n\t"
          "vfasb %%v19,%%v19,%%v23\n\t"
          "vmrhg %%v16,%%v16,%%v17\n\t"
          "vmrhg %%v17,%%v18,%%v19\n\t"
          "verllg %%v18,%%v16,32\n\t"
          "verllg %%v19,%%v17,32\n\t"
          /* broadcast alpha with sign pattern matching the conjugation mode */
#if !defined(XCONJ)
          "vlrepf %%v20,0(%[alpha])\n\t"
          "vlef %%v21,4(%[alpha]),0\n\t"
          "vlef %%v21,4(%[alpha]),2\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,4(%[alpha]),1\n\t"
          "vlef %%v21,4(%[alpha]),3\n\t"
#else
          "vlef %%v20,0(%[alpha]),1\n\t"
          "vlef %%v20,0(%[alpha]),3\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,0(%[alpha]),0\n\t"
          "vlef %%v20,0(%[alpha]),2\n\t"
          "vlrepf %%v21,4(%[alpha])\n\t"
#endif
          /* y[0..7] += alpha * result */
          "vl %%v22,0(%[y])\n\t"
          "vl %%v23,16(%[y])\n\t"
          "vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
          "vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
          "vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
          "vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
          "vst %%v22,0(%[y])\n\t"
          "vst %%v23,16(%[y])"
          : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
|
||||
|
||||
/* Two-column variant of the transpose GEMV micro-kernel: y[0..3] +=
 * alpha * (ap[0..1]^T dot x) for two complex results. Same structure and
 * register conventions as cgemv_kernel_4x4; n is halved in the asm, so it
 * must be even (callers pass NB, a multiple of 4). */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__(/* clear accumulators */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          /* permute masks v2/v3 (same patterns as the 4x4 kernel) */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          "vl %%v20,0(%%r1,%[ap0])\n\t"
          "vperm %%v21,%%v20,%%v20,%%v3\n\t"
          "vperm %%v20,%%v20,%%v20,%%v2\n\t"
          "vl %%v22,0(%%r1,%[ap1])\n\t"
          "vperm %%v23,%%v22,%%v22,%%v3\n\t"
          "vperm %%v22,%%v22,%%v22,%%v2\n\t"
          "vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
          "vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
          "vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
          "vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* reduce accumulators */
          "vfasb %%v16,%%v16,%%v18\n\t"
          "vfasb %%v17,%%v17,%%v19\n\t"
          "vrepg %%v18,%%v16,1\n\t"
          "vrepg %%v19,%%v17,1\n\t"
          "vfasb %%v16,%%v16,%%v18\n\t"
          "vfasb %%v17,%%v17,%%v19\n\t"
          "vmrhg %%v16,%%v16,%%v17\n\t"
          "verllg %%v17,%%v16,32\n\t"
          /* alpha broadcast with CONJ-dependent sign pattern */
#if !defined(XCONJ)
          "vlrepf %%v18,0(%[alpha])\n\t"
          "vlef %%v19,4(%[alpha]),0\n\t"
          "vlef %%v19,4(%[alpha]),2\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,4(%[alpha]),1\n\t"
          "vlef %%v19,4(%[alpha]),3\n\t"
#else
          "vlef %%v18,0(%[alpha]),1\n\t"
          "vlef %%v18,0(%[alpha]),3\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[alpha]),0\n\t"
          "vlef %%v18,0(%[alpha]),2\n\t"
          "vlrepf %%v19,4(%[alpha])\n\t"
#endif
          /* y[0..3] += alpha * result */
          "vl %%v20,0(%[y])\n\t"
          "vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
          "vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
          "vst %%v20,0(%[y])"
          : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23");
}
|
||||
|
||||
/* Single-column variant: y[0..1] += alpha * (ap^T dot x) for one complex
 * result. Note 'ap' is a plain column pointer here, not an array of
 * pointers as in the 4x4/4x2 kernels. n must be even (callers pass NB). */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  __asm__(/* clear accumulators */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          /* permute masks v2/v3 (same patterns as the other kernels) */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          "vl %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v3\n\t"
          "vperm %%v18,%%v18,%%v18,%%v2\n\t"
          "vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
          "vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* reduce to one complex value */
          "vfasb %%v16,%%v16,%%v17\n\t"
          "vrepg %%v17,%%v16,1\n\t"
          "vfasb %%v16,%%v16,%%v17\n\t"
          "verllg %%v17,%%v16,32\n\t"
#if !defined(XCONJ)
          "vlrepf %%v18,0(%[alpha])\n\t"
          "vlef %%v19,4(%[alpha]),0\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,4(%[alpha]),1\n\t"
#else
          "vlef %%v18,0(%[alpha]),1\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[alpha]),0\n\t"
          "vlrepf %%v19,4(%[alpha])\n\t"
#endif
          /* y[0..1] += alpha * result (8-byte element load/store) */
          "vleg %%v0,0(%[y]),0\n\t"
          "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
          "vsteg %%v0,0(%[y]),0"
          : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
|
||||
|
||||
/* Gather n complex (real,imag) element pairs from a strided source into a
 * contiguous destination buffer. inc_src is counted in FLOAT units (the
 * driver pre-doubles the caller's complex increment). */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  BLASLONG k;

  for (k = 0; k < n; k++) {
    dest[0] = src[0];
    dest[1] = src[1];
    dest += 2;
    src += inc_src;
  }
}
|
||||
|
||||
/* Complex GEMV, transposed case: y := y + alpha * A^T * x (or conjugated
 * variants per CONJ/XCONJ). m is the column length, n the number of
 * columns. Columns are processed in NBMAX-element blocks through the
 * vector kernels; an m%4 remainder of 1..3 rows is handled by the scalar
 * tails at the bottom. Returns 0 always (BLAS kernel convention). */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer) {
  BLASLONG i;
  BLASLONG j;
  FLOAT *a_ptr;
  FLOAT *x_ptr;
  FLOAT *y_ptr;
  FLOAT *ap[8];
  BLASLONG n1;
  BLASLONG m1;
  BLASLONG m2;
  BLASLONG m3;
  BLASLONG n2;
  BLASLONG lda4;
  FLOAT ybuffer[8], *xbuffer;
  FLOAT alpha[2];

  if (m < 1)
    return (0);
  if (n < 1)
    return (0);

  /* switch from complex-element counts to FLOAT counts */
  inc_x <<= 1;
  inc_y <<= 1;
  lda <<= 1;
  lda4 = lda << 2;  /* stride of 4 columns */

  xbuffer = buffer;

  n1 = n >> 2;  /* groups of 4 columns */
  n2 = n & 3;   /* leftover columns */

  m3 = m & 3;              /* row remainder handled by scalar code */
  m1 = m - m3;
  m2 = (m & (NBMAX - 1)) - m3;  /* size of the final partial block */

  alpha[0] = alpha_r;
  alpha[1] = alpha_i;

  BLASLONG NB = NBMAX;

  while (NB == NBMAX) {

    m1 -= NB;
    if (m1 < 0) {
      if (m2 == 0)
        break;
      NB = m2;  /* last (smaller) block */
    }

    y_ptr = y;
    a_ptr = a;
    x_ptr = x;
    ap[0] = a_ptr;
    ap[1] = a_ptr + lda;
    ap[2] = ap[1] + lda;
    ap[3] = ap[2] + lda;
    /* kernels need unit-stride x; gather into the buffer if strided */
    if (inc_x != 2)
      copy_x(NB, x_ptr, xbuffer, inc_x);
    else
      xbuffer = x_ptr;

    if (inc_y == 2) {
      /* contiguous y: kernels update y in place */
      for (i = 0; i < n1; i++) {
        cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
        y_ptr += 8;

      }

      if (n2 & 2) {
        cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
        a_ptr += lda * 2;
        y_ptr += 4;

      }

      if (n2 & 1) {
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
        /* a_ptr += lda;
           y_ptr += 2; */

      }

    } else {
      /* strided y: accumulate into ybuffer, then scatter */
      for (i = 0; i < n1; i++) {
        memset(ybuffer, 0, sizeof(ybuffer));
        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;

        y_ptr[0] += ybuffer[0];
        y_ptr[1] += ybuffer[1];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[2];
        y_ptr[1] += ybuffer[3];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[4];
        y_ptr[1] += ybuffer[5];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[6];
        y_ptr[1] += ybuffer[7];
        y_ptr += inc_y;

      }

      for (i = 0; i < n2; i++) {
        memset(ybuffer, 0, sizeof(ybuffer));
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
        a_ptr += lda;
        y_ptr[0] += ybuffer[0];
        y_ptr[1] += ybuffer[1];
        y_ptr += inc_y;

      }

    }
    /* advance to the next row block */
    a += 2 * NB;
    x += NB * inc_x;
  }

  if (m3 == 0)
    return (0);

  /* scalar handling of the final m%4 rows */
  x_ptr = x;
  j = 0;
  a_ptr = a;
  y_ptr = y;

  if (m3 == 3) {

    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x2 = x_ptr[0];
    FLOAT x3 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x4 = x_ptr[0];
    FLOAT x5 = x_ptr[1];
    while (j < n) {
      /* dot product of 3 rows with column j, sign per conjugation mode */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
      temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
      temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else

      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
      temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
      temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif

#if !defined(XCONJ)
      y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
      y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
      y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
      y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }
    return (0);
  }

  if (m3 == 2) {

    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT temp_r1;
    FLOAT temp_i1;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x2 = x_ptr[0];
    FLOAT x3 = x_ptr[1];
    FLOAT ar = alpha[0];
    FLOAT ai = alpha[1];

    /* two columns per iteration */
    while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else

      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

      a_ptr += lda;
      y_ptr += inc_y;
      j += 2;
    }

    /* odd trailing column */
    while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else

      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }

    return (0);
  }

  if (m3 == 1) {

    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT temp_r1;
    FLOAT temp_i1;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    FLOAT ar = alpha[0];
    FLOAT ai = alpha[1];

    /* two columns per iteration */
    while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else

      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

      a_ptr += lda;
      y_ptr += inc_y;
      j += 2;
    }

    /* odd trailing column */
    while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else

      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }
    return (0);
  }

  return (0);
}
|
||||
236
kernel/zarch/crot.c
Normal file
236
kernel/zarch/crot.c
Normal file
@@ -0,0 +1,236 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Vectorized Givens rotation over n complex single-precision elements
 * (n must be a multiple of 32): x' = c*x + s*y, y' = c*y - s*x, applied
 * in place. The asm processes 32 complex values (256 bytes) per loop trip
 * in four 64-byte sub-blocks; c and s are scalars broadcast into v0/v1. */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
  __asm__("vlrepf %%v0,%[c]\n\t"
          "vlrepf %%v1,%[s]\n\t"
          "srlg %[n],%[n],5\n\t"  /* n /= 32 iterations */
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* sub-block 0: bytes 0..63 */
          "vl %%v24, 0(%%r1,%[x])\n\t"
          "vl %%v25, 16(%%r1,%[x])\n\t"
          "vl %%v26, 32(%%r1,%[x])\n\t"
          "vl %%v27, 48(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[y])\n\t"
          "vl %%v17, 16(%%r1,%[y])\n\t"
          "vl %%v18, 32(%%r1,%[y])\n\t"
          "vl %%v19, 48(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 0(%%r1,%[x])\n\t"
          "vst %%v29, 16(%%r1,%[x])\n\t"
          "vst %%v30, 32(%%r1,%[x])\n\t"
          "vst %%v31, 48(%%r1,%[x])\n\t"
          "vst %%v20, 0(%%r1,%[y])\n\t"
          "vst %%v21, 16(%%r1,%[y])\n\t"
          "vst %%v22, 32(%%r1,%[y])\n\t"
          "vst %%v23, 48(%%r1,%[y])\n\t"
          /* sub-block 1: bytes 64..127 */
          "vl %%v24, 64(%%r1,%[x])\n\t"
          "vl %%v25, 80(%%r1,%[x])\n\t"
          "vl %%v26, 96(%%r1,%[x])\n\t"
          "vl %%v27, 112(%%r1,%[x])\n\t"
          "vl %%v16, 64(%%r1,%[y])\n\t"
          "vl %%v17, 80(%%r1,%[y])\n\t"
          "vl %%v18, 96(%%r1,%[y])\n\t"
          "vl %%v19, 112(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 64(%%r1,%[x])\n\t"
          "vst %%v29, 80(%%r1,%[x])\n\t"
          "vst %%v30, 96(%%r1,%[x])\n\t"
          "vst %%v31, 112(%%r1,%[x])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          /* sub-block 2: bytes 128..191 */
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v16, 128(%%r1,%[y])\n\t"
          "vl %%v17, 144(%%r1,%[y])\n\t"
          "vl %%v18, 160(%%r1,%[y])\n\t"
          "vl %%v19, 176(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 128(%%r1,%[x])\n\t"
          "vst %%v29, 144(%%r1,%[x])\n\t"
          "vst %%v30, 160(%%r1,%[x])\n\t"
          "vst %%v31, 176(%%r1,%[x])\n\t"
          "vst %%v20, 128(%%r1,%[y])\n\t"
          "vst %%v21, 144(%%r1,%[y])\n\t"
          "vst %%v22, 160(%%r1,%[y])\n\t"
          "vst %%v23, 176(%%r1,%[y])\n\t"
          /* sub-block 3: bytes 192..255 */
          "vl %%v24, 192(%%r1,%[x])\n\t"
          "vl %%v25, 208(%%r1,%[x])\n\t"
          "vl %%v26, 224(%%r1,%[x])\n\t"
          "vl %%v27, 240(%%r1,%[x])\n\t"
          "vl %%v16, 192(%%r1,%[y])\n\t"
          "vl %%v17, 208(%%r1,%[y])\n\t"
          "vl %%v18, 224(%%r1,%[y])\n\t"
          "vl %%v19, 240(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 192(%%r1,%[x])\n\t"
          "vst %%v29, 208(%%r1,%[x])\n\t"
          "vst %%v30, 224(%%r1,%[x])\n\t"
          "vst %%v31, 240(%%r1,%[x])\n\t"
          "vst %%v20, 192(%%r1,%[y])\n\t"
          "vst %%v21, 208(%%r1,%[y])\n\t"
          "vst %%v22, 224(%%r1,%[y])\n\t"
          "vst %%v23, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
            "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
|
||||
|
||||
/* CROT: apply the plane rotation (c, s) to n complex elements of x and y:
 *   x[i] = c*x[i] + s*y[i];  y[i] = c*y[i] - s*x[i]
 * (the real scalars c and s are applied component-wise to the real and
 * imaginary parts). Uses the vector kernel for unit-stride data in blocks
 * of 32 complex elements; anything else falls to scalar code.
 * Returns 0 always (BLAS kernel convention). */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT c, FLOAT s) {
  BLASLONG i = 0;
  BLASLONG ix = 0, iy = 0;
  FLOAT temp[2];
  BLASLONG inc_x2;
  BLASLONG inc_y2;

  if (n <= 0)
    return (0);

  if ((inc_x == 1) && (inc_y == 1)) {
    /* contiguous path: vector kernel for the multiple-of-32 prefix */
    BLASLONG n1 = n & -32;
    if (n1 > 0) {
      FLOAT cosa = c;
      FLOAT sina = s;
      crot_kernel_32(n1, x, y, &cosa, &sina);
      i = n1;
      ix = 2 * n1;
    }

    /* scalar tail */
    for (; i < n; i++, ix += 2) {
      temp[0] = c * x[ix] + s * y[ix];
      temp[1] = c * x[ix + 1] + s * y[ix + 1];
      y[ix] = c * y[ix] - s * x[ix];
      y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
      x[ix] = temp[0];
      x[ix + 1] = temp[1];
    }

  } else {
    /* strided path: strides counted in FLOAT units */
    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;
    for (; i < n; i++, ix += inc_x2, iy += inc_y2) {
      temp[0] = c * x[ix] + s * y[iy];
      temp[1] = c * x[ix + 1] + s * y[iy + 1];
      y[iy] = c * y[iy] - s * x[ix];
      y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
      x[ix] = temp[0];
      x[ix + 1] = temp[1];
    }

  }
  return (0);

}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user