Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop

This commit is contained in:
AbdelRauf
2019-04-29 08:57:44 +00:00
197 changed files with 17904 additions and 7444 deletions

View File

@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
ifeq ($(TARGET), GENERIC)
ifeq ($(CORE), GENERIC)
USE_TRMM = 1
endif
@@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif
ifeq ($(CORE), Z14)
USE_TRMM = 1
endif

View File

@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];

175
kernel/arm64/KERNEL.TSV110 Normal file
View File

@@ -0,0 +1,175 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];

View File

@@ -129,7 +129,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4

View File

@@ -134,7 +134,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)
dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4

View File

@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", // "0", "1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movsd %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4

View File

@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",

View File

@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
"jnz 1b \n\t"
@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4

View File

@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t"
:
"+r" (n) // 0
:
"r" (n), // 0
"r" (x), // 1
"r" (x1), // 2
"r" (alpha), // 3

View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n1), // 0
"+r" (x) // 1
:
:
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n1), // 0
"+r" (x) // 1
:
:
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",

View File

@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t"
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
" vmovups (%9), %%ymm0 \n\t"
" vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
" vmovups 32(%9), %%ymm4 \n\t"
" vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0
" addq $64, %9 \n\t" // b=b+8
" addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups %%ymm8 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm9 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm10, (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm11, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm12, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm13, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm14, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
" addq $32, %8 \n\t" // a=a+8
" addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
" vmovups %%ymm15, (%8) \n\t" // write a
" vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
"r" (as), // 8
"r" (bs) // 9
"r" (a), // 8
"r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" prefetcht0 384(%3,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" prefetcht0 384(%7,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1
" vmovddup (%7), %%xmm1 \n\t" // read b
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm1 \n\t" // read b
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0
" subq $16 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 8
" subq $16 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 8
" vmovddup (%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -135,7 +135,7 @@
#endif
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCHING

View File

@@ -383,7 +383,7 @@
EMMS
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCHING

View File

@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"movss %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"3: \n\t"
:
"+r" (i), // 0
"+r" (n1) // 1
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",

View File

@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
"vbroadcastss (%3), %%xmm12 \n\t" // x0
"vbroadcastss 4(%3), %%xmm13 \n\t" // x1
"vbroadcastss 8(%3), %%xmm14 \n\t" // x2
"vbroadcastss 12(%3), %%xmm15 \n\t" // x3
"vbroadcastss 16(%3), %%xmm0 \n\t" // x4
"vbroadcastss 20(%3), %%xmm1 \n\t" // x5
"vbroadcastss 24(%3), %%xmm2 \n\t" // x6
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
"2: \n\t"
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"subq $8 , %1 \n\t"
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
"addq $16, %2 \n\t"
"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
@@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
@@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"addq $16, %2 \n\t"
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
"jnz 1b \n\t"
@@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"

View File

@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"movss (%3), %%xmm12 \n\t" // x0
"movss 4(%3), %%xmm13 \n\t" // x1
"movss 8(%3), %%xmm14 \n\t" // x2
"movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss 16(%2), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7
"movss 16(%3), %%xmm0 \n\t" // x4
"movss 20(%3), %%xmm1 \n\t" // x5
"movss 24(%3), %%xmm2 \n\t" // x6
"movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
".p2align 1 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
"movups (%5,%0,4), %%xmm8 \n\t"
"movups (%6,%0,4), %%xmm9 \n\t"
"movups (%7,%0,4), %%xmm10 \n\t"
"movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t"
"movups (%5,%2,4), %%xmm8 \n\t"
"movups (%6,%2,4), %%xmm9 \n\t"
"movups (%7,%2,4), %%xmm10 \n\t"
"movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
"jnz 1b \n\t"
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
"vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
"addq $4, %8 \n\t"
"addq $4, %2 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
"addq $8, %8 \n\t"
"addq $8, %2 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
"addq $16, %8 \n\t"
"addq $16, %2 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4

View File

@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4

View File

@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4

View File

@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t"
" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i]
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i]
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i]
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i]
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i]
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i]
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i]
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i]
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i]
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n
" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i]
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t"
" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i]
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i]
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i]
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i]
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i]
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i]
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i]
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i]
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i]
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i]
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n
" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 0
" vbroadcastss (%7), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%7), %%xmm1 \n\t" // read b
" vbroadcastss (%3), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%3), %%xmm1 \n\t" // read b
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t"
" \n\t" // i = 1
" addq $8 , %7 \n\t" // b = b + 2
" addq $64 , %6 \n\t" // a = a + 16
" addq $8 , %3 \n\t" // b = b + 2
" addq $64 , %2 \n\t" // a = a + 16
" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1
" vbroadcastss (%7), %%xmm1 \n\t" // read b
" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
" vbroadcastss (%3), %%xmm1 \n\t" // read b
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0
" subq $8 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 16
" subq $8 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 16
" vbroadcastss (%7), %%xmm0 \n\t" // read bb
" vbroadcastss (%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4

View File

@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -107,10 +107,10 @@ if ( n < 1280 )
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -199,10 +199,10 @@ if ( n < 1280 )
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4

View File

@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",

View File

@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
"+r" (n), // 0
"+r" (x) // 1
:
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",

View File

@@ -1,18 +1,18 @@
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = damax_z13.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = zamax_z13.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
DAMINKERNEL = damin_z13.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = zamin_z13.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
DMAXKERNEL = dmax_z13.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
DMINKERNEL = dmin_z13.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c
@@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = idmax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
IDMINKERNEL = idmin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = dasum.c

146
kernel/zarch/KERNEL.Z14 Normal file
View File

@@ -0,0 +1,146 @@
SAMAXKERNEL = samax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = camax.c
ZAMAXKERNEL = zamax.c
SAMINKERNEL = samin.c
DAMINKERNEL = damin.c
CAMINKERNEL = camin.c
ZAMINKERNEL = zamin.c
SMAXKERNEL = smax.c
DMAXKERNEL = dmax.c
SMINKERNEL = smin.c
DMINKERNEL = dmin.c
ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
IZAMAXKERNEL = izamax.c
ISAMINKERNEL = isamin.c
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ismax.c
IDMAXKERNEL = idmax.c
ISMINKERNEL = ismin.c
IDMINKERNEL = idmin.c
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSDOTKERNEL = dsdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = srot.c
DROTKERNEL = drot.c
CROTKERNEL = crot.c
ZROTKERNEL = zrot.c
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = sswap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c
SGEMVNKERNEL = sgemv_n_4.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = cgemv_n_4.c
ZGEMVNKERNEL = zgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = cgemv_t_4.c
ZGEMVTKERNEL = zgemv_t_4.c
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S
CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm8x4V.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ctrmm4x4V.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ztrmm4x4V.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

215
kernel/zarch/camax.c Normal file
View File

@@ -0,0 +1,215 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amax;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);
} else {
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (maxf);
}
}

215
kernel/zarch/camin.c Normal file
View File

@@ -0,0 +1,215 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amin;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);
} else {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (minf);
}
}

155
kernel/zarch/casum.c Normal file
View File

@@ -0,0 +1,155 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define ABS fabsf
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ip = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0)
return (sumf);
if (inc_x == 1) {
n1 = n & -32;
if (n1 > 0) {
sumf = casum_kernel_32(n1, x);
i = n1;
ip = 2 * n1;
}
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
}
} else {
inc_x2 = 2 * inc_x;
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
}
}
return (sumf);
}

166
kernel/zarch/caxpy.c Normal file
View File

@@ -0,0 +1,166 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
#if !defined(CONJ)
"vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%[alpha]),3\n\t"
#else
"vlef %%v0,0(%[alpha]),1\n\t"
"vlef %%v0,0(%[alpha]),3\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,0(%[alpha]),0\n\t"
"vlef %%v0,0(%[alpha]),2\n\t"
"vlrepf %%v1,4(%[alpha])\n\t"
#endif
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
"verllg %%v24,%%v8,32\n\t"
"verllg %%v25,%%v9,32\n\t"
"verllg %%v26,%%v10,32\n\t"
"verllg %%v27,%%v11,32\n\t"
"verllg %%v28,%%v16,32\n\t"
"verllg %%v29,%%v17,32\n\t"
"verllg %%v30,%%v18,32\n\t"
"verllg %%v31,%%v19,32\n\t"
"vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
"vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if (n1) {
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_16(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
#if !defined(CONJ)
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
i++;
ix += 2;
}
return (0);
}
inc_x *= 2;
inc_y *= 2;
while (i < n) {
#if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
ix += inc_x;
iy += inc_y;
i++;
}
return (0);
}

88
kernel/zarch/ccopy.c Normal file
View File

@@ -0,0 +1,88 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],5\n\t"
"0:\n\t"
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
ccopy_kernel_32(n1, x, y);
i = n1;
ix = n1 * 2;
iy = n1 * 2;
}
while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
iy += 2;
i++;
}
} else {
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;
iy += inc_y2;
i++;
}
}
return (0);
}

176
kernel/zarch/cdot.c Normal file
View File

@@ -0,0 +1,176 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vrepg %%v26,%%v24,1\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v25,%%v25,%%v27\n\t"
"vfasb %%v25,%%v25,%%v29\n\t"
"vfasb %%v25,%%v25,%%v31\n\t"
"vrepg %%v27,%%v25,1\n\t"
"vfasb %%v25,%%v25,%%v27\n\t"
"vstef %%v24,0(%[d]),0\n\t"
"vstef %%v24,4(%[d]),1\n\t"
"vstef %%v25,8(%[d]),1\n\t"
"vstef %%v25,12(%[d]),0"
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/* Complex single-precision dot product of x and y.
 *
 * Accumulates four partial sums [re*re, im*im, re*im, im*re] and combines
 * them at the end; CONJ selects whether x is conjugated in the product.
 * Unit-stride inputs are handled in blocks of 16 by the vector kernel,
 * everything else by a scalar loop.  Returns 0+0i for n <= 0. */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
                             BLASLONG inc_y) {
  OPENBLAS_COMPLEX_FLOAT result;
  /* 16-byte alignment so the assembly kernel can store with vstef */
  FLOAT sums[4] __attribute__ ((aligned(16))) = {
    0.0, 0.0, 0.0, 0.0};

  if (n <= 0) {
    CREAL(result) = 0.0;
    CIMAG(result) = 0.0;
    return (result);
  }

  if ((inc_x == 1) && (inc_y == 1)) {
    BLASLONG n1 = n & -16;  /* largest multiple of 16 */

    if (n1)
      cdot_kernel_16(n1, x, y, sums);

    /* scalar tail over the remaining elements */
    BLASLONG k;
    for (k = n1; k < n; k++) {
      BLASLONG p = k * 2;
      sums[0] += x[p] * y[p];
      sums[1] += x[p + 1] * y[p + 1];
      sums[2] += x[p] * y[p + 1];
      sums[3] += x[p + 1] * y[p];
    }
  } else {
    /* strided scalar path; strides expressed in FLOAT units */
    BLASLONG sx = inc_x * 2;
    BLASLONG sy = inc_y * 2;
    BLASLONG px = 0;
    BLASLONG py = 0;
    BLASLONG k;

    for (k = 0; k < n; k++) {
      sums[0] += x[px] * y[py];
      sums[1] += x[px + 1] * y[py + 1];
      sums[2] += x[px] * y[py + 1];
      sums[3] += x[px + 1] * y[py];
      px += sx;
      py += sy;
    }
  }

#if !defined(CONJ)
  CREAL(result) = sums[0] - sums[1];
  CIMAG(result) = sums[2] + sums[3];
#else
  CREAL(result) = sums[0] + sums[1];
  CIMAG(result) = sums[2] - sums[3];
#endif
  return (result);
}

752
kernel/zarch/cgemv_n_4.c Normal file
View File

@@ -0,0 +1,752 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/* No-transpose complex GEMV core over four columns:
 *   y[i] += ap0[i]*x[0] + ap1[i]*x[1] + ap2[i]*x[2] + ap3[i]*x[3]
 * in complex single precision, two complex elements (one 16-byte vector)
 * per loop iteration.  n is assumed to be a positive multiple of 2.
 * x holds the four complex multipliers; CONJ/XCONJ select the sign
 * pattern of the imaginary parts loaded into v20..v23. */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  /* v16..v19: (re, im) of x[0..3] broadcast across the vector;
     v20..v23: the matching swapped (+/-im, re) pairs for the cross terms */
  __asm__("vlrepg %%v16,0(%[x])\n\t"
          "vlrepg %%v17,8(%[x])\n\t"
          "vlrepg %%v18,16(%[x])\n\t"
          "vlrepg %%v19,24(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v20,4(%[x]),0\n\t"
          "vlef %%v20,4(%[x]),2\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,0(%[x]),1\n\t"
          "vlef %%v20,0(%[x]),3\n\t"
          "vlef %%v21,12(%[x]),0\n\t"
          "vlef %%v21,12(%[x]),2\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,8(%[x]),1\n\t"
          "vlef %%v21,8(%[x]),3\n\t"
          "vlef %%v22,20(%[x]),0\n\t"
          "vlef %%v22,20(%[x]),2\n\t"
          "vflcsb %%v22,%%v22\n\t"
          "vlef %%v22,16(%[x]),1\n\t"
          "vlef %%v22,16(%[x]),3\n\t"
          "vlef %%v23,28(%[x]),0\n\t"
          "vlef %%v23,28(%[x]),2\n\t"
          "vflcsb %%v23,%%v23\n\t"
          "vlef %%v23,24(%[x]),1\n\t"
          "vlef %%v23,24(%[x]),3\n\t"
#else
          "vlef %%v20,0(%[x]),1\n\t"
          "vlef %%v20,0(%[x]),3\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,4(%[x]),0\n\t"
          "vlef %%v20,4(%[x]),2\n\t"
          "vlef %%v21,8(%[x]),1\n\t"
          "vlef %%v21,8(%[x]),3\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,12(%[x]),0\n\t"
          "vlef %%v21,12(%[x]),2\n\t"
          "vlef %%v22,16(%[x]),1\n\t"
          "vlef %%v22,16(%[x]),3\n\t"
          "vflcsb %%v22,%%v22\n\t"
          "vlef %%v22,20(%[x]),0\n\t"
          "vlef %%v22,20(%[x]),2\n\t"
          "vlef %%v23,24(%[x]),1\n\t"
          "vlef %%v23,24(%[x]),3\n\t"
          "vflcsb %%v23,%%v23\n\t"
          "vlef %%v23,28(%[x]),0\n\t"
          "vlef %%v23,28(%[x]),2\n\t"
#endif
          /* v1/v2: byte permutation masks used to duplicate the real
             (v1) and imaginary (v2) parts of each loaded column vector */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 vector iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v24,0(%%r1,%[ap0])\n\t"
          "vperm %%v25,%%v24,%%v24,%%v2\n\t"
          "vperm %%v24,%%v24,%%v24,%%v1\n\t"
          "vl %%v26,0(%%r1,%[ap1])\n\t"
          "vperm %%v27,%%v26,%%v26,%%v2\n\t"
          "vperm %%v26,%%v26,%%v26,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
          "vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
          "vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
          "vl %%v28,0(%%r1,%[ap2])\n\t"
          "vperm %%v29,%%v28,%%v28,%%v2\n\t"
          "vperm %%v28,%%v28,%%v28,%%v1\n\t"
          "vl %%v30,0(%%r1,%[ap3])\n\t"
          "vperm %%v31,%%v30,%%v30,%%v2\n\t"
          "vperm %%v30,%%v30,%%v30,%%v1\n\t"
          "vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
          "vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
          "vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/* No-transpose complex GEMV core over two columns:
 *   y[i] += ap0[i]*x[0] + ap1[i]*x[1]  (complex single precision),
 * two complex elements per iteration.  Same register scheme as the
 * 4x4 variant, restricted to two columns; n must be a positive
 * multiple of 2. */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  /* v16/v17: broadcast (re, im) of x[0..1]; v18/v19: swapped
     (+/-im, re) pairs, sign chosen by CONJ/XCONJ */
  __asm__("vlrepg %%v16,0(%[x])\n\t"
          "vlrepg %%v17,8(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v18,4(%[x]),0\n\t"
          "vlef %%v18,4(%[x]),2\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[x]),1\n\t"
          "vlef %%v18,0(%[x]),3\n\t"
          "vlef %%v19,12(%[x]),0\n\t"
          "vlef %%v19,12(%[x]),2\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,8(%[x]),1\n\t"
          "vlef %%v19,8(%[x]),3\n\t"
#else
          "vlef %%v18,0(%[x]),1\n\t"
          "vlef %%v18,0(%[x]),3\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,4(%[x]),0\n\t"
          "vlef %%v18,4(%[x]),2\n\t"
          "vlef %%v19,8(%[x]),1\n\t"
          "vlef %%v19,8(%[x]),3\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,12(%[x]),0\n\t"
          "vlef %%v19,12(%[x]),2\n\t"
#endif
          /* v1/v2: byte permutation masks duplicating real (v1) and
             imaginary (v2) parts of each loaded column vector */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v20,0(%%r1,%[ap0])\n\t"
          "vperm %%v21,%%v20,%%v20,%%v2\n\t"
          "vperm %%v20,%%v20,%%v20,%%v1\n\t"
          "vl %%v22,0(%%r1,%[ap1])\n\t"
          "vperm %%v23,%%v22,%%v22,%%v2\n\t"
          "vperm %%v22,%%v22,%%v22,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
          "vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23");
}
/* No-transpose complex GEMV core over a single column:
 *   y[i] += ap[i] * x[0]  (complex single precision),
 * two complex elements per iteration; n must be a positive multiple
 * of 2.  Same register scheme as the multi-column variants. */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
  /* v16: broadcast (re, im) of x[0]; v17: swapped (+/-im, re) pair,
     sign chosen by CONJ/XCONJ */
  __asm__("vlrepg %%v16,0(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v17,4(%[x]),0\n\t"
          "vlef %%v17,4(%[x]),2\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef %%v17,0(%[x]),1\n\t"
          "vlef %%v17,0(%[x]),3\n\t"
#else
          "vlef %%v17,0(%[x]),1\n\t"
          "vlef %%v17,0(%[x]),3\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef %%v17,4(%[x]),0\n\t"
          "vlef %%v17,4(%[x]),2\n\t"
#endif
          /* v1/v2: byte permutation masks duplicating real (v1) and
             imaginary (v2) parts of the loaded column vector */
          "vleib %%v1,0,0\n\t"
          "vleib %%v1,1,1\n\t"
          "vleib %%v1,2,2\n\t"
          "vleib %%v1,3,3\n\t"
          "vleib %%v1,0,4\n\t"
          "vleib %%v1,1,5\n\t"
          "vleib %%v1,2,6\n\t"
          "vleib %%v1,3,7\n\t"
          "vleib %%v1,8,8\n\t"
          "vleib %%v1,9,9\n\t"
          "vleib %%v1,10,10\n\t"
          "vleib %%v1,11,11\n\t"
          "vleib %%v1,8,12\n\t"
          "vleib %%v1,9,13\n\t"
          "vleib %%v1,10,14\n\t"
          "vleib %%v1,11,15\n\t"
          "vleib %%v2,4,0\n\t"
          "vleib %%v2,5,1\n\t"
          "vleib %%v2,6,2\n\t"
          "vleib %%v2,7,3\n\t"
          "vleib %%v2,4,4\n\t"
          "vleib %%v2,5,5\n\t"
          "vleib %%v2,6,6\n\t"
          "vleib %%v2,7,7\n\t"
          "vleib %%v2,12,8\n\t"
          "vleib %%v2,13,9\n\t"
          "vleib %%v2,14,10\n\t"
          "vleib %%v2,15,11\n\t"
          "vleib %%v2,12,12\n\t"
          "vleib %%v2,13,13\n\t"
          "vleib %%v2,14,14\n\t"
          "vleib %%v2,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v2\n\t"
          "vperm %%v18,%%v18,%%v18,%%v1\n\t"
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
          "vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
            "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
}
/* dest[i] += alpha * src[i] for n complex single-precision elements in
 * contiguous storage, four complex elements (two vectors) per iteration.
 * n is assumed to be a positive multiple of 4; XCONJ selects whether
 * alpha is conjugated. */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
                    FLOAT alpha_i) {
  /* v0: replicated alpha_r pattern, v1: (+/-alpha_i, alpha_i) pattern so
     that two fmas per vector produce the full complex multiply-add */
  __asm__(
#if !defined(XCONJ)
          "vlrepf %%v0,%[alpha_r]\n\t"
          "vlef %%v1,%[alpha_i],0\n\t"
          "vlef %%v1,%[alpha_i],2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,%[alpha_i],1\n\t"
          "vlef %%v1,%[alpha_i],3\n\t"
#else
          "vlef %%v0,%[alpha_r],1\n\t"
          "vlef %%v0,%[alpha_r],3\n\t"
          "vflcsb %%v0,%%v0\n\t"
          "vlef %%v0,%[alpha_r],0\n\t"
          "vlef %%v0,%[alpha_r],2\n\t"
          "vlrepf %%v1,%[alpha_i]\n\t"
#endif
          /* r1 = byte offset, loop count = n/4 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],2\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[src])\n\t"
          "pfd 2,1024(%%r1,%[dest])\n\t"
          "vl %%v16,0(%%r1,%[src])\n\t"
          "vl %%v17,16(%%r1,%[src])\n\t"
          "vl %%v18,0(%%r1,%[dest])\n\t"
          "vl %%v19,16(%%r1,%[dest])\n\t"
          /* rotate by 32 bits to pair each real with its imaginary part */
          "verllg %%v20,%%v16,32\n\t"
          "verllg %%v21,%%v17,32\n\t"
          "vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
          "vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
          "vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
          "vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
          "vst %%v22,0(%%r1,%[dest])\n\t"
          "vst %%v23,16(%%r1,%[dest])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
          : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
            [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23");
}
/* Accumulates alpha * src into dest for n complex elements.
 * dest has stride inc_dest (in FLOAT units); the contiguous case
 * (inc_dest == 2) is dispatched to the vector kernel, any other
 * stride is handled element by element.  XCONJ conjugates alpha. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
                  FLOAT alpha_r, FLOAT alpha_i) {
  if (inc_dest == 2) {
    add_y_4(n, src, dest, alpha_r, alpha_i);
    return;
  }

  BLASLONG k;
  for (k = 0; k < n; k++) {
    FLOAT re;
    FLOAT im;
#if !defined(XCONJ)
    re = alpha_r * src[0] - alpha_i * src[1];
    im = alpha_r * src[1] + alpha_i * src[0];
#else
    re = alpha_r * src[0] + alpha_i * src[1];
    im = -alpha_r * src[1] + alpha_i * src[0];
#endif
    dest[0] += re;
    dest[1] += im;
    src += 2;
    dest += inc_dest;
  }
}
/* Complex single-precision GEMV, no-transpose form:
 *   y := y + alpha * A * x
 * A is m x n (column-major, leading dimension lda), x has stride inc_x,
 * y has stride inc_y.  Columns are processed in groups of 4/2/1 via the
 * cgemv_kernel_* helpers into a zeroed accumulation buffer, which is then
 * scaled by alpha and merged into y; rows are blocked in chunks of NBMAX.
 * The m%4 leftover rows (m3 = 1, 2 or 3) are handled by dedicated scalar
 * tails.  Returns 0. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer) {
  BLASLONG i;
  FLOAT *a_ptr;
  FLOAT *x_ptr;
  FLOAT *y_ptr;
  FLOAT *ap[4];                  /* pointers to four consecutive columns */
  BLASLONG n1;                   /* number of 4-column groups */
  BLASLONG m1;                   /* rows handled in full NBMAX blocks */
  BLASLONG m2;                   /* rows of the final partial block */
  BLASLONG m3;                   /* leftover rows (m % 4) */
  BLASLONG n2;                   /* leftover columns (n % 4) */
  BLASLONG lda4;
  FLOAT xbuffer[8], *ybuffer;
  if (m < 1)
    return (0);
  if (n < 1)
    return (0);
  ybuffer = buffer;
  /* work in FLOAT units: one complex element = 2 floats */
  inc_x *= 2;
  inc_y *= 2;
  lda *= 2;
  lda4 = 4 * lda;
  n1 = n / 4;
  n2 = n % 4;
  m3 = m % 4;
  m1 = m - (m % 4);
  m2 = (m % NBMAX) - (m % 4);
  y_ptr = y;
  BLASLONG NB = NBMAX;
  /* block over rows; the last (partial) block sets NB = m2 and exits */
  while (NB == NBMAX) {
    m1 -= NB;
    if (m1 < 0) {
      if (m2 == 0)
        break;
      NB = m2;
    }
    a_ptr = a;
    ap[0] = a_ptr;
    ap[1] = a_ptr + lda;
    ap[2] = ap[1] + lda;
    ap[3] = ap[2] + lda;
    x_ptr = x;
    //zero_y(NB,ybuffer);
    /* accumulate into a zeroed buffer (NB complex = NB*8 bytes) */
    memset(ybuffer, 0, NB * 8);
    if (inc_x == 2) {
      /* contiguous x: feed it to the kernels directly */
      for (i = 0; i < n1; i++) {
        cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
        x_ptr += 8;
      }
      if (n2 & 2) {
        cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
        x_ptr += 4;
        a_ptr += 2 * lda;
      }
      if (n2 & 1) {
        cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
        /* x_ptr += 2;
           a_ptr += lda; */
      }
    } else {
      /* strided x: gather up to 4 complex scalars into xbuffer first */
      for (i = 0; i < n1; i++) {
        xbuffer[0] = x_ptr[0];
        xbuffer[1] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[2] = x_ptr[0];
        xbuffer[3] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[4] = x_ptr[0];
        xbuffer[5] = x_ptr[1];
        x_ptr += inc_x;
        xbuffer[6] = x_ptr[0];
        xbuffer[7] = x_ptr[1];
        x_ptr += inc_x;
        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
      }
      for (i = 0; i < n2; i++) {
        xbuffer[0] = x_ptr[0];
        xbuffer[1] = x_ptr[1];
        x_ptr += inc_x;
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
        a_ptr += 1 * lda;
      }
    }
    /* scale accumulated block by alpha and merge into y */
    add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
    a += 2 * NB;
    y_ptr += NB * inc_y;
  }
  if (m3 == 0)
    return (0);
  /* scalar tails for the final 1, 2 or 3 rows */
  if (m3 == 1) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r = 0.0;
    FLOAT temp_i = 0.0;
    /* fast path: row is contiguous in memory and x is contiguous */
    if (lda == 2 && inc_x == 2) {
      for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
        temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
        temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
        a_ptr += 4;
        x_ptr += 4;
      }
      for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
        a_ptr += 2;
        x_ptr += 2;
      }
    } else {
      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
        a_ptr += lda;
        x_ptr += inc_x;
      }
    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
    y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
    y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
    y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
    return (0);
  }
  if (m3 == 2) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r0 = 0.0;
    FLOAT temp_i0 = 0.0;
    FLOAT temp_r1 = 0.0;
    FLOAT temp_i1 = 0.0;
    /* fast path: the two rows are packed contiguously (lda == 4 floats) */
    if (lda == 4 && inc_x == 2) {
      for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
        temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
        temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
        temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
        temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
        temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
        temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
        temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
        temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
        a_ptr += 8;
        x_ptr += 4;
      }
      for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
        a_ptr += 4;
        x_ptr += 2;
      }
    } else {
      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
        a_ptr += lda;
        x_ptr += inc_x;
      }
    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
    return (0);
  }
  if (m3 == 3) {
    a_ptr = a;
    x_ptr = x;
    FLOAT temp_r0 = 0.0;
    FLOAT temp_i0 = 0.0;
    FLOAT temp_r1 = 0.0;
    FLOAT temp_i1 = 0.0;
    FLOAT temp_r2 = 0.0;
    FLOAT temp_i2 = 0.0;
    /* fast path: the three rows are packed contiguously (lda == 6 floats) */
    if (lda == 6 && inc_x == 2) {
      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
        a_ptr += 6;
        x_ptr += 2;
      }
    } else {
      for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
        a_ptr += lda;
        x_ptr += inc_x;
      }
    }
#if !defined(XCONJ)
    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
    y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
    y_ptr += inc_y;
    y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
    y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
    return (0);
  }
  return (0);
}

724
kernel/zarch/cgemv_t_4.c Normal file
View File

@@ -0,0 +1,724 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/* Transposed complex GEMV core over four columns: computes the four
 * complex dot products dot_j = sum_i ap_j[i] * x[i] (j = 0..3), scales
 * them by *alpha and accumulates into y[0..7].  Two complex elements of
 * x per iteration; n is assumed to be a positive multiple of 2.
 * CONJ/XCONJ select the conjugation pattern applied to x and alpha. */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  /* v16..v19: direct-term accumulators, v20..v23: cross-term
     accumulators (one pair per column) */
  __asm__("vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          "vzero %%v20\n\t"
          "vzero %%v21\n\t"
          "vzero %%v22\n\t"
          "vzero %%v23\n\t"
          /* v2/v3: byte permutation masks duplicating real (v2) and
             imaginary (v3) parts of each loaded column vector */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          "vl %%v24,0(%%r1,%[ap0])\n\t"
          "vperm %%v25,%%v24,%%v24,%%v3\n\t"
          "vperm %%v24,%%v24,%%v24,%%v2\n\t"
          "vl %%v26,0(%%r1,%[ap1])\n\t"
          "vperm %%v27,%%v26,%%v26,%%v3\n\t"
          "vperm %%v26,%%v26,%%v26,%%v2\n\t"
          "vl %%v28,0(%%r1,%[ap2])\n\t"
          "vperm %%v29,%%v28,%%v28,%%v3\n\t"
          "vperm %%v28,%%v28,%%v28,%%v2\n\t"
          "vl %%v30,0(%%r1,%[ap3])\n\t"
          "vperm %%v31,%%v30,%%v30,%%v3\n\t"
          "vperm %%v30,%%v30,%%v30,%%v2\n\t"
          "vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
          "vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
          "vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
          "vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
          "vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
          "vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
          "vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* horizontal reduction of the per-column accumulators */
          "vfasb %%v16,%%v16,%%v20\n\t"
          "vfasb %%v17,%%v17,%%v21\n\t"
          "vfasb %%v18,%%v18,%%v22\n\t"
          "vfasb %%v19,%%v19,%%v23\n\t"
          "vrepg %%v20,%%v16,1\n\t"
          "vrepg %%v21,%%v17,1\n\t"
          "vrepg %%v22,%%v18,1\n\t"
          "vrepg %%v23,%%v19,1\n\t"
          "vfasb %%v16,%%v16,%%v20\n\t"
          "vfasb %%v17,%%v17,%%v21\n\t"
          "vfasb %%v18,%%v18,%%v22\n\t"
          "vfasb %%v19,%%v19,%%v23\n\t"
          "vmrhg %%v16,%%v16,%%v17\n\t"
          "vmrhg %%v17,%%v18,%%v19\n\t"
          "verllg %%v18,%%v16,32\n\t"
          "verllg %%v19,%%v17,32\n\t"
          /* build alpha patterns and fold the dot products into y */
#if !defined(XCONJ)
          "vlrepf %%v20,0(%[alpha])\n\t"
          "vlef %%v21,4(%[alpha]),0\n\t"
          "vlef %%v21,4(%[alpha]),2\n\t"
          "vflcsb %%v21,%%v21\n\t"
          "vlef %%v21,4(%[alpha]),1\n\t"
          "vlef %%v21,4(%[alpha]),3\n\t"
#else
          "vlef %%v20,0(%[alpha]),1\n\t"
          "vlef %%v20,0(%[alpha]),3\n\t"
          "vflcsb %%v20,%%v20\n\t"
          "vlef %%v20,0(%[alpha]),0\n\t"
          "vlef %%v20,0(%[alpha]),2\n\t"
          "vlrepf %%v21,4(%[alpha])\n\t"
#endif
          "vl %%v22,0(%[y])\n\t"
          "vl %%v23,16(%[y])\n\t"
          "vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
          "vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
          "vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
          "vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
          "vst %%v22,0(%[y])\n\t"
          "vst %%v23,16(%[y])"
          : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/* Transposed complex GEMV core over two columns: computes the two
 * complex dot products of ap0/ap1 with x, scales them by *alpha and
 * accumulates into y[0..3].  Two complex elements per iteration;
 * n must be a positive multiple of 2.  Same scheme as the 4x4 variant. */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  /* v16/v17: direct-term accumulators, v18/v19: cross-term accumulators */
  __asm__("vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          /* v2/v3: byte permutation masks duplicating real (v2) and
             imaginary (v3) parts of each loaded column vector */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          "vl %%v20,0(%%r1,%[ap0])\n\t"
          "vperm %%v21,%%v20,%%v20,%%v3\n\t"
          "vperm %%v20,%%v20,%%v20,%%v2\n\t"
          "vl %%v22,0(%%r1,%[ap1])\n\t"
          "vperm %%v23,%%v22,%%v22,%%v3\n\t"
          "vperm %%v22,%%v22,%%v22,%%v2\n\t"
          "vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
          "vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
          "vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
          "vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* horizontal reduction of the per-column accumulators */
          "vfasb %%v16,%%v16,%%v18\n\t"
          "vfasb %%v17,%%v17,%%v19\n\t"
          "vrepg %%v18,%%v16,1\n\t"
          "vrepg %%v19,%%v17,1\n\t"
          "vfasb %%v16,%%v16,%%v18\n\t"
          "vfasb %%v17,%%v17,%%v19\n\t"
          "vmrhg %%v16,%%v16,%%v17\n\t"
          "verllg %%v17,%%v16,32\n\t"
          /* build alpha patterns and fold the dot products into y */
#if !defined(XCONJ)
          "vlrepf %%v18,0(%[alpha])\n\t"
          "vlef %%v19,4(%[alpha]),0\n\t"
          "vlef %%v19,4(%[alpha]),2\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,4(%[alpha]),1\n\t"
          "vlef %%v19,4(%[alpha]),3\n\t"
#else
          "vlef %%v18,0(%[alpha]),1\n\t"
          "vlef %%v18,0(%[alpha]),3\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[alpha]),0\n\t"
          "vlef %%v18,0(%[alpha]),2\n\t"
          "vlrepf %%v19,4(%[alpha])\n\t"
#endif
          "vl %%v20,0(%[y])\n\t"
          "vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
          "vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
          "vst %%v20,0(%[y])"
          : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
            "v21", "v22", "v23");
}
/* Transposed complex GEMV core over a single column: computes the complex
 * dot product of ap with x, scales it by *alpha and accumulates it into
 * y[0..1] (stored with vsteg as one 8-byte element).  Two complex elements
 * per iteration; n must be a positive multiple of 2. */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  /* v16: direct-term accumulator, v17: cross-term accumulator */
  __asm__("vzero %%v16\n\t"
          "vzero %%v17\n\t"
          /* v2/v3: byte permutation masks duplicating real (v2) and
             imaginary (v3) parts of the loaded column vector */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          /* r1 = byte offset, loop count = n/2 */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          "vl %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v3\n\t"
          "vperm %%v18,%%v18,%%v18,%%v2\n\t"
          "vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
          "vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* horizontal reduction of the accumulators */
          "vfasb %%v16,%%v16,%%v17\n\t"
          "vrepg %%v17,%%v16,1\n\t"
          "vfasb %%v16,%%v16,%%v17\n\t"
          "verllg %%v17,%%v16,32\n\t"
          /* build alpha patterns and fold the dot product into y */
#if !defined(XCONJ)
          "vlrepf %%v18,0(%[alpha])\n\t"
          "vlef %%v19,4(%[alpha]),0\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,4(%[alpha]),1\n\t"
#else
          "vlef %%v18,0(%[alpha]),1\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[alpha]),0\n\t"
          "vlrepf %%v19,4(%[alpha])\n\t"
#endif
          "vleg %%v0,0(%[y]),0\n\t"
          "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
          "vsteg %%v0,0(%[y]),0"
          : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
/* Gathers n complex elements from src (stride inc_src, in FLOAT units)
 * into the contiguous buffer dest. */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  BLASLONG k;
  for (k = 0; k < n; k++) {
    dest[2 * k] = src[0];
    dest[2 * k + 1] = src[1];
    src += inc_src;
  }
}
/* Single-precision complex GEMV, transposed/conjugate-transposed form:
 * accumulates one dot product per column of A into y,
 *   y[j] += alpha * dot(op(A(:,j)), op(x))   for j = 0..n-1,
 * where the CONJ/XCONJ macros select which operands are conjugated
 * (evident from the sign flips in the scalar tail below).
 * A is column-major with leading dimension lda (in complex elements on
 * entry); x has m complex elements with stride inc_x, y has n complex
 * elements with stride inc_y.  dummy1 is unused; buffer is caller-
 * supplied scratch used to pack x when it is not contiguous.
 * Rows are processed in blocks of NBMAX (defined elsewhere in this
 * file) so the packed x block stays cache-resident; full blocks go
 * through the vector kernels cgemv_kernel_4x{4,2,1}, and the final
 * m mod 4 rows are handled by scalar tail code. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer) {
  BLASLONG i;
  BLASLONG j;
  FLOAT *a_ptr;
  FLOAT *x_ptr;
  FLOAT *y_ptr;
  FLOAT *ap[8]; /* column pointers for the 4x4 kernel (only ap[0..3] used) */
  BLASLONG n1;
  BLASLONG m1;
  BLASLONG m2;
  BLASLONG m3;
  BLASLONG n2;
  BLASLONG lda4;
  FLOAT ybuffer[8], *xbuffer;
  FLOAT alpha[2];
  /* Quick return for empty problems. */
  if (m < 1)
    return (0);
  if (n < 1)
    return (0);
  /* Convert complex-element strides to FLOAT (real) strides. */
  inc_x <<= 1;
  inc_y <<= 1;
  lda <<= 1;
  lda4 = lda << 2; /* stride of four columns, in FLOATs */
  xbuffer = buffer;
  n1 = n >> 2;                 /* number of 4-column groups */
  n2 = n & 3;                  /* leftover columns (0..3) */
  m3 = m & 3;                  /* leftover rows for the scalar tail */
  m1 = m - m3;                 /* rows handled by the vector kernels */
  m2 = (m & (NBMAX - 1)) - m3; /* size of the partial row block (mult. of 4) */
  alpha[0] = alpha_r;
  alpha[1] = alpha_i;
  BLASLONG NB = NBMAX;
  /* Walk the rows in blocks of NBMAX; on the last pass NB shrinks to m2
   * (the loop condition then fails after that pass). */
  while (NB == NBMAX) {
    m1 -= NB;
    if (m1 < 0) {
      if (m2 == 0)
        break;
      NB = m2;
    }
    y_ptr = y;
    a_ptr = a;
    x_ptr = x;
    ap[0] = a_ptr;
    ap[1] = a_ptr + lda;
    ap[2] = ap[1] + lda;
    ap[3] = ap[2] + lda;
    /* Pack the current x block into contiguous scratch unless x already
     * is contiguous (inc_x == 2 after the shift above). */
    if (inc_x != 2)
      copy_x(NB, x_ptr, xbuffer, inc_x);
    else
      xbuffer = x_ptr;
    if (inc_y == 2) {
      /* Contiguous y: the kernels accumulate into y directly. */
      for (i = 0; i < n1; i++) {
        cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
        y_ptr += 8; /* four complex results */
      }
      if (n2 & 2) {
        cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
        a_ptr += lda * 2;
        y_ptr += 4;
      }
      if (n2 & 1) {
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
        /* a_ptr += lda;
           y_ptr += 2; */
      }
    } else {
      /* Strided y: compute into ybuffer, then scatter-accumulate. */
      for (i = 0; i < n1; i++) {
        memset(ybuffer, 0, sizeof(ybuffer));
        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
        ap[0] += lda4;
        ap[1] += lda4;
        ap[2] += lda4;
        ap[3] += lda4;
        a_ptr += lda4;
        y_ptr[0] += ybuffer[0];
        y_ptr[1] += ybuffer[1];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[2];
        y_ptr[1] += ybuffer[3];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[4];
        y_ptr[1] += ybuffer[5];
        y_ptr += inc_y;
        y_ptr[0] += ybuffer[6];
        y_ptr[1] += ybuffer[7];
        y_ptr += inc_y;
      }
      for (i = 0; i < n2; i++) {
        memset(ybuffer, 0, sizeof(ybuffer));
        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
        a_ptr += lda;
        y_ptr[0] += ybuffer[0];
        y_ptr[1] += ybuffer[1];
        y_ptr += inc_y;
      }
    }
    a += 2 * NB;     /* advance A by NB rows (2 FLOATs per complex) */
    x += NB * inc_x; /* advance x by NB complex elements */
  }
  /* Scalar tail: the last m3 (= m mod 4) rows, across all n columns. */
  if (m3 == 0)
    return (0);
  x_ptr = x;
  j = 0;
  a_ptr = a;
  y_ptr = y;
  if (m3 == 3) {
    /* Three remaining rows: load them once, then sweep the columns. */
    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x2 = x_ptr[0];
    FLOAT x3 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x4 = x_ptr[0];
    FLOAT x5 = x_ptr[1];
    while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
      temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
      temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
      temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
      temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif
      /* y[j] += alpha * temp, with alpha conjugated when XCONJ is set. */
#if !defined(XCONJ)
      y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
      y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
      y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
      y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }
    return (0);
  }
  if (m3 == 2) {
    /* Two remaining rows; columns are processed two at a time, then a
     * single-column cleanup loop. */
    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT temp_r1;
    FLOAT temp_i1;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    x_ptr += inc_x;
    FLOAT x2 = x_ptr[0];
    FLOAT x3 = x_ptr[1];
    FLOAT ar = alpha[0];
    FLOAT ai = alpha[1];
    while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
      a_ptr += lda;
      y_ptr += inc_y;
      j += 2;
    }
    while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }
    return (0);
  }
  if (m3 == 1) {
    /* One remaining row; same two-columns-at-a-time structure. */
    FLOAT temp_r;
    FLOAT temp_i;
    FLOAT temp_r1;
    FLOAT temp_i1;
    FLOAT x0 = x_ptr[0];
    FLOAT x1 = x_ptr[1];
    FLOAT ar = alpha[0];
    FLOAT ai = alpha[1];
    while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
      a_ptr += lda;
      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
      y_ptr += inc_y;
      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
      a_ptr += lda;
      y_ptr += inc_y;
      j += 2;
    }
    while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
      y_ptr[0] += ar * temp_r - ai * temp_i;
      y_ptr[1] += ar * temp_i + ai * temp_r;
#else
      y_ptr[0] += ar * temp_r + ai * temp_i;
      y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
      a_ptr += lda;
      y_ptr += inc_y;
      j++;
    }
    return (0);
  }
  return (0);
}

236
kernel/zarch/crot.c Normal file
View File

@@ -0,0 +1,236 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/* Vectorized core of the complex single-precision plane rotation:
 *   x[i] = c*x[i] + s*y[i]
 *   y[i] = c*y[i] - s*x[i]   (using the pre-rotation x)
 * for n complex elements, where n must be a positive multiple of 32
 * (the caller masks with n & -32 and the loop count is n >> 5).
 * c and s are real scalars broadcast into v0/v1 with vlrepf; each loop
 * iteration processes 256 bytes (32 complex floats) per operand in four
 * 64-byte sub-blocks.  x and y must be contiguous. */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
  __asm__("vlrepf %%v0,%[c]\n\t"   /* v0 = broadcast cosine */
          "vlrepf %%v1,%[s]\n\t"   /* v1 = broadcast sine */
          "srlg %[n],%[n],5\n\t"   /* loop count = n / 32 */
          "xgr %%r1,%%r1\n\t"      /* r1 = running byte offset, start at 0 */
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"   /* prefetch both streams for store */
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* sub-block 0: bytes 0..63 */
          "vl %%v24, 0(%%r1,%[x])\n\t"
          "vl %%v25, 16(%%r1,%[x])\n\t"
          "vl %%v26, 32(%%r1,%[x])\n\t"
          "vl %%v27, 48(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[y])\n\t"
          "vl %%v17, 16(%%r1,%[y])\n\t"
          "vl %%v18, 32(%%r1,%[y])\n\t"
          "vl %%v19, 48(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t" /* xn=x*c */
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" /* xn=x*c+y*s */
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 0(%%r1,%[x])\n\t"
          "vst %%v29, 16(%%r1,%[x])\n\t"
          "vst %%v30, 32(%%r1,%[x])\n\t"
          "vst %%v31, 48(%%r1,%[x])\n\t"
          "vst %%v20, 0(%%r1,%[y])\n\t"
          "vst %%v21, 16(%%r1,%[y])\n\t"
          "vst %%v22, 32(%%r1,%[y])\n\t"
          "vst %%v23, 48(%%r1,%[y])\n\t"
          /* sub-block 1: bytes 64..127 (same pattern) */
          "vl %%v24, 64(%%r1,%[x])\n\t"
          "vl %%v25, 80(%%r1,%[x])\n\t"
          "vl %%v26, 96(%%r1,%[x])\n\t"
          "vl %%v27, 112(%%r1,%[x])\n\t"
          "vl %%v16, 64(%%r1,%[y])\n\t"
          "vl %%v17, 80(%%r1,%[y])\n\t"
          "vl %%v18, 96(%%r1,%[y])\n\t"
          "vl %%v19, 112(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 64(%%r1,%[x])\n\t"
          "vst %%v29, 80(%%r1,%[x])\n\t"
          "vst %%v30, 96(%%r1,%[x])\n\t"
          "vst %%v31, 112(%%r1,%[x])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          /* sub-block 2: bytes 128..191 (same pattern) */
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v16, 128(%%r1,%[y])\n\t"
          "vl %%v17, 144(%%r1,%[y])\n\t"
          "vl %%v18, 160(%%r1,%[y])\n\t"
          "vl %%v19, 176(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 128(%%r1,%[x])\n\t"
          "vst %%v29, 144(%%r1,%[x])\n\t"
          "vst %%v30, 160(%%r1,%[x])\n\t"
          "vst %%v31, 176(%%r1,%[x])\n\t"
          "vst %%v20, 128(%%r1,%[y])\n\t"
          "vst %%v21, 144(%%r1,%[y])\n\t"
          "vst %%v22, 160(%%r1,%[y])\n\t"
          "vst %%v23, 176(%%r1,%[y])\n\t"
          /* sub-block 3: bytes 192..255 (same pattern) */
          "vl %%v24, 192(%%r1,%[x])\n\t"
          "vl %%v25, 208(%%r1,%[x])\n\t"
          "vl %%v26, 224(%%r1,%[x])\n\t"
          "vl %%v27, 240(%%r1,%[x])\n\t"
          "vl %%v16, 192(%%r1,%[y])\n\t"
          "vl %%v17, 208(%%r1,%[y])\n\t"
          "vl %%v18, 224(%%r1,%[y])\n\t"
          "vl %%v19, 240(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 192(%%r1,%[x])\n\t"
          "vst %%v29, 208(%%r1,%[x])\n\t"
          "vst %%v30, 224(%%r1,%[x])\n\t"
          "vst %%v31, 240(%%r1,%[x])\n\t"
          "vst %%v20, 192(%%r1,%[y])\n\t"
          "vst %%v21, 208(%%r1,%[y])\n\t"
          "vst %%v22, 224(%%r1,%[y])\n\t"
          "vst %%v23, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"      /* advance offset by 256 bytes */
          "brctg %[n],0b"          /* decrement count, loop while nonzero */
          /* Whole x/y arrays are read-write memory operands so the
           * compiler cannot cache them across the asm. */
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
            "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/* CSROT: apply a plane rotation with real cosine c and sine s to the
 * complex single-precision vectors x and y:
 *   x[i] = c*x[i] + s*y[i]
 *   y[i] = c*y[i] - s*x[i]
 * inc_x/inc_y are strides in complex elements.  When both vectors are
 * contiguous, multiples of 32 elements are handled by the vectorized
 * crot_kernel_32; the remainder (and all strided cases) use scalar
 * code.  Always returns 0. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT c, FLOAT s) {
  BLASLONG i, ix, iy;
  FLOAT tr, ti;

  if (n <= 0)
    return (0);

  if ((inc_x == 1) && (inc_y == 1)) {
    /* Contiguous fast path. */
    BLASLONG n1 = n & -32; /* largest multiple of 32 not exceeding n */
    i = 0;
    ix = 0;
    if (n1 > 0) {
      /* The kernel takes c and s by address; copy to addressable locals. */
      FLOAT cosa = c, sina = s;
      crot_kernel_32(n1, x, y, &cosa, &sina);
      i = n1;
      ix = 2 * n1;
    }
    /* Scalar cleanup of the last n mod 32 elements. */
    for (; i < n; i++, ix += 2) {
      tr = c * x[ix] + s * y[ix];
      ti = c * x[ix + 1] + s * y[ix + 1];
      y[ix] = c * y[ix] - s * x[ix];
      y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
      x[ix] = tr;
      x[ix + 1] = ti;
    }
  } else {
    /* General strided path; strides converted to FLOAT units. */
    BLASLONG inc_x2 = 2 * inc_x;
    BLASLONG inc_y2 = 2 * inc_y;
    ix = 0;
    iy = 0;
    for (i = 0; i < n; i++) {
      tr = c * x[ix] + s * y[iy];
      ti = c * x[ix + 1] + s * y[iy + 1];
      y[iy] = c * y[iy] - s * x[ix];
      y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
      x[ix] = tr;
      x[ix + 1] = ti;
      ix += inc_x2;
      iy += inc_y2;
    }
  }
  return (0);
}

Some files were not shown because too many files have changed in this diff Show More