Merge pull request #538 from wernsaar/develop
Added optimized cdot- and zdot-kernels
This commit is contained in:
commit
62231ab337
|
@ -3,7 +3,8 @@ CAXPYKERNEL = caxpy.c
|
||||||
ZAXPYKERNEL = zaxpy.c
|
ZAXPYKERNEL = zaxpy.c
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
#DDOTKERNEL = ddot.c
|
CDOTKERNEL = cdot.c
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
DSYMV_U_KERNEL = dsymv_U.c
|
DSYMV_U_KERNEL = dsymv_U.c
|
||||||
DSYMV_L_KERNEL = dsymv_L.c
|
DSYMV_L_KERNEL = dsymv_L.c
|
||||||
|
@ -26,11 +27,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
||||||
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
||||||
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
||||||
|
@ -40,6 +41,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
@ -49,6 +51,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
||||||
ZGEMMINCOPY =
|
ZGEMMINCOPY =
|
||||||
ZGEMMITCOPY =
|
ZGEMMITCOPY =
|
||||||
|
|
|
@ -12,6 +12,9 @@ CGEMVTKERNEL = cgemv_t_4.c
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
DDOTKERNEL = ddot.c
|
DDOTKERNEL = ddot.c
|
||||||
|
CDOTKERNEL = cdot.c
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
|
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
|
|
|
@ -12,6 +12,9 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||||
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
|
CDOTKERNEL = cdot.c
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
DDOTKERNEL = ddot_bulldozer.S
|
DDOTKERNEL = ddot_bulldozer.S
|
||||||
DCOPYKERNEL = dcopy_bulldozer.S
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,9 @@ ZGEMVNKERNEL = zgemv_n_4.c
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
DDOTKERNEL = ddot.c
|
DDOTKERNEL = ddot.c
|
||||||
|
CDOTKERNEL = cdot.c
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
|
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
|
|
|
@ -4,6 +4,9 @@ ZAXPYKERNEL = zaxpy.c
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
DDOTKERNEL = ddot.c
|
DDOTKERNEL = ddot.c
|
||||||
|
CDOTKERNEL = cdot.c
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
|
|
||||||
DSYMV_U_KERNEL = dsymv_U.c
|
DSYMV_U_KERNEL = dsymv_U.c
|
||||||
DSYMV_L_KERNEL = dsymv_L.c
|
DSYMV_L_KERNEL = dsymv_L.c
|
||||||
|
|
|
@ -0,0 +1,174 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2015, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <complex.h>
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(BULLDOZER)
|
||||||
|
#include "cdot_microk_bulldozer-2.c"
|
||||||
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
|
||||||
|
#include "cdot_microk_steamroller-2.c"
|
||||||
|
#elif defined(HASWELL)
|
||||||
|
#include "cdot_microk_haswell-2.c"
|
||||||
|
#elif defined(SANDYBRIDGE)
|
||||||
|
#include "cdot_microk_sandy-2.c"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
||||||
|
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));

/*
 * Portable fallback used when no architecture-specific microkernel has
 * defined HAVE_KERNEL_16.
 *
 * x and y hold interleaved (real, imag) pairs.  Each outer iteration
 * consumes four complex elements (eight FLOATs) from both vectors and
 * advances the element counter by four until it reaches n.
 *
 * The eight partial sums written to d are:
 *   d[0], d[2] : sum of x_re * y_re   (even / odd elements)
 *   d[1], d[3] : sum of x_im * y_im   (even / odd elements)
 *   d[4], d[6] : sum of x_re * y_im   (even / odd elements)
 *   d[5], d[7] : sum of x_im * y_re   (even / odd elements)
 * The caller is responsible for folding these into the final result.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
	BLASLONG done;
	BLASLONG base = 0;
	BLASLONG k;

	for ( done = 0; done < n; done += 4 )
	{
		/* four complex elements per pass; even ones feed lanes 0/1/4/5,
		   odd ones feed lanes 2/3/6/7 */
		for ( k = 0; k < 4; k++ )
		{
			FLOAT *xe = x + base + 2 * k;
			FLOAT *ye = y + base + 2 * k;
			BLASLONG lane = 2 * (k & 1);

			acc[lane]     += xe[0] * ye[0] ;
			acc[lane + 1] += xe[1] * ye[1] ;
			acc[lane + 4] += xe[0] * ye[1] ;
			acc[lane + 5] += xe[1] * ye[0] ;
		}

		base += 8;
	}

	for ( k = 0; k < 8; k++ )
		d[k] = acc[k];
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
 * Complex dot product of two vectors stored as interleaved (real, imag)
 * FLOAT pairs.
 *
 *   n      number of complex elements; n <= 0 yields 0 + 0i
 *   x, y   input vectors
 *   inc_x  stride of x, counted in complex elements
 *   inc_y  stride of y, counted in complex elements
 *
 * Four running sums are kept:
 *   dot[0] = sum x_re*y_re     dot[1] = sum x_im*y_im
 *   dot[4] = sum x_re*y_im     dot[5] = sum x_im*y_re
 * Without CONJ the result is (dot[0]-dot[1]) + i*(dot[4]+dot[5]);
 * with CONJ it is            (dot[0]+dot[1]) + i*(dot[4]-dot[5]).
 *
 * For unit strides the leading multiple-of-16 part of the vectors is
 * handed to cdot_kernel_16 and the remainder is handled by a scalar loop.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i;
	BLASLONG ix,iy;
	FLOAT _Complex result;
	FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ;

	if ( n <= 0 )
	{
		__real__ result = 0.0 ;
		__imag__ result = 0.0 ;
		return(result);
	}

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* BLASLONG rather than int: both the block length and the FLOAT
		   index (2 * element index) must not be narrowed for large n. */
		BLASLONG n1 = n & -16;
		BLASLONG j;

		if ( n1 )
		{
			cdot_kernel_16(n1, x, y , dot );

			/* fold the kernel's second set of partial sums into the first */
			dot[0] += dot[2];
			dot[1] += dot[3];
			dot[4] += dot[6];
			dot[5] += dot[7];
		}

		/* scalar tail: the elements the kernel did not cover */
		i = n1;
		j = i * 2;
		while( i < n )
		{
			dot[0] += x[j]   * y[j] ;
			dot[1] += x[j+1] * y[j+1] ;
			dot[4] += x[j]   * y[j+1] ;
			dot[5] += x[j+1] * y[j] ;

			j+=2;
			i++ ;
		}
	}
	else
	{
		i=0;
		ix=0;
		iy=0;

		/* strides are given in complex elements; convert to FLOAT units */
		inc_x <<= 1;
		inc_y <<= 1;

		while(i < n)
		{
			dot[0] += x[ix]   * y[iy] ;
			dot[1] += x[ix+1] * y[iy+1] ;
			dot[4] += x[ix]   * y[iy+1] ;
			dot[5] += x[ix+1] * y[iy] ;

			ix += inc_x ;
			iy += inc_y ;
			i++ ;
		}
	}

#if !defined(CONJ)
	__real__ result = dot[0] - dot[1];
	__imag__ result = dot[4] + dot[5];
#else
	__real__ result = dot[0] + dot[1];
	__imag__ result = dot[4] - dot[5];
#endif

	return(result);
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * Bulldozer microkernel (128-bit AVX with 4-operand vfmaddps).
 *
 * x and y hold interleaved (real, imag) FLOAT pairs.  Each loop iteration
 * loads 16 FLOATs (8 complex elements) from each vector, advances the
 * FLOAT index by 16 and decrements n by 8.  The loop only terminates when
 * n reaches exactly zero, so n must be a positive multiple of 8.
 *
 * Results stored to dot:
 *   dot[0..3] : partial sums of (x_re*y_re, x_im*y_im, x_re*y_re, x_im*y_im)
 *   dot[4..7] : partial sums of (x_re*y_im, x_im*y_re, x_re*y_im, x_im*y_re)
 *
 * Vectors with more than 1024 elements take a second, otherwise identical
 * loop that additionally issues prefetcht0 384 bytes ahead on both inputs.
 *
 * The index i and the counter n are modified inside the asm, so they are
 * declared as read-write ("+r") operands; the operand numbers %0..%4 are
 * unchanged by that.
 */
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = 0;

	if ( n <=1024 )
	{

	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorps		%%xmm0, %%xmm0, %%xmm0	             \n\t"
	"vxorps		%%xmm1, %%xmm1, %%xmm1	             \n\t"
	"vxorps		%%xmm2, %%xmm2, %%xmm2	             \n\t"
	"vxorps		%%xmm3, %%xmm3, %%xmm3	             \n\t"
	"vxorps		%%xmm4, %%xmm4, %%xmm4	             \n\t"
	"vxorps		%%xmm5, %%xmm5, %%xmm5	             \n\t"
	"vxorps		%%xmm6, %%xmm6, %%xmm6	             \n\t"
	"vxorps		%%xmm7, %%xmm7, %%xmm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"vmovups                  (%2,%0,4), %%xmm8          \n\t"  // 2 * x
	"vmovups                16(%2,%0,4), %%xmm9          \n\t"  // 2 * x

	"vmovups                  (%3,%0,4), %%xmm12         \n\t"  // 2 * y
	"vmovups                16(%3,%0,4), %%xmm13         \n\t"  // 2 * y

	"vmovups                32(%2,%0,4), %%xmm10         \n\t"  // 2 * x
	"vmovups                48(%2,%0,4), %%xmm11         \n\t"  // 2 * x

	"vmovups                32(%3,%0,4), %%xmm14         \n\t"  // 2 * y
	"vmovups                48(%3,%0,4), %%xmm15         \n\t"  // 2 * y

	"vfmaddps      %%xmm0, %%xmm8 , %%xmm12, %%xmm0     \n\t"  // x_r * y_r, x_i * y_i
	"vfmaddps      %%xmm1, %%xmm9 , %%xmm13, %%xmm1     \n\t"  // x_r * y_r, x_i * y_i

	"vpermilps     $0xb1 , %%xmm12, %%xmm12               \n\t"  // exchange real and imag part
	"vpermilps     $0xb1 , %%xmm13, %%xmm13               \n\t"

	"vfmaddps      %%xmm2, %%xmm10, %%xmm14, %%xmm2     \n\t"  // x_r * y_r, x_i * y_i
	"vfmaddps      %%xmm3, %%xmm11, %%xmm15, %%xmm3     \n\t"  // x_r * y_r, x_i * y_i

	"vpermilps     $0xb1 , %%xmm14, %%xmm14               \n\t"
	"vpermilps     $0xb1 , %%xmm15, %%xmm15               \n\t"

	"vfmaddps      %%xmm4, %%xmm8 , %%xmm12, %%xmm4     \n\t"  // x_r * y_i, x_i * y_r
	"addq		$16 , %0	  	 	      \n\t"
	"vfmaddps      %%xmm5, %%xmm9 , %%xmm13, %%xmm5     \n\t"  // x_r * y_i, x_i * y_r
	"vfmaddps      %%xmm6, %%xmm10, %%xmm14, %%xmm6     \n\t"  // x_r * y_i, x_i * y_r
	"subq	        $8  , %1		              \n\t"
	"vfmaddps      %%xmm7, %%xmm11, %%xmm15, %%xmm7     \n\t"  // x_r * y_i, x_i * y_r

	"jnz		1b		              \n\t"  // flags come from subq; the FMA does not touch them

	"vaddps        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddps        %%xmm2, %%xmm3, %%xmm2	\n\t"
	"vaddps        %%xmm0, %%xmm2, %%xmm0	\n\t"

	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"
	"vaddps        %%xmm6, %%xmm7, %%xmm6	\n\t"
	"vaddps        %%xmm4, %%xmm6, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"
	"vmovups       %%xmm4,  16(%4)		\n\t"
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)  	// 1
	:
	  "r" (x),      // 2
	  "r" (y),      // 3
	  "r" (dot)     // 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

	return;
	}

	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorps		%%xmm0, %%xmm0, %%xmm0	             \n\t"
	"vxorps		%%xmm1, %%xmm1, %%xmm1	             \n\t"
	"vxorps		%%xmm2, %%xmm2, %%xmm2	             \n\t"
	"vxorps		%%xmm3, %%xmm3, %%xmm3	             \n\t"
	"vxorps		%%xmm4, %%xmm4, %%xmm4	             \n\t"
	"vxorps		%%xmm5, %%xmm5, %%xmm5	             \n\t"
	"vxorps		%%xmm6, %%xmm6, %%xmm6	             \n\t"
	"vxorps		%%xmm7, %%xmm7, %%xmm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"prefetcht0            384(%2,%0,4)                  \n\t"
	"vmovups                  (%2,%0,4), %%xmm8          \n\t"  // 2 * x
	"vmovups                16(%2,%0,4), %%xmm9          \n\t"  // 2 * x

	"prefetcht0            384(%3,%0,4)                  \n\t"
	"vmovups                  (%3,%0,4), %%xmm12         \n\t"  // 2 * y
	"vmovups                16(%3,%0,4), %%xmm13         \n\t"  // 2 * y

	"vmovups                32(%2,%0,4), %%xmm10         \n\t"  // 2 * x
	"vmovups                48(%2,%0,4), %%xmm11         \n\t"  // 2 * x

	"vmovups                32(%3,%0,4), %%xmm14         \n\t"  // 2 * y
	"vmovups                48(%3,%0,4), %%xmm15         \n\t"  // 2 * y

	"vfmaddps      %%xmm0, %%xmm8 , %%xmm12, %%xmm0     \n\t"  // x_r * y_r, x_i * y_i
	"vfmaddps      %%xmm1, %%xmm9 , %%xmm13, %%xmm1     \n\t"  // x_r * y_r, x_i * y_i

	"vpermilps     $0xb1 , %%xmm12, %%xmm12               \n\t"  // exchange real and imag part
	"vpermilps     $0xb1 , %%xmm13, %%xmm13               \n\t"

	"vfmaddps      %%xmm2, %%xmm10, %%xmm14, %%xmm2     \n\t"  // x_r * y_r, x_i * y_i
	"vfmaddps      %%xmm3, %%xmm11, %%xmm15, %%xmm3     \n\t"  // x_r * y_r, x_i * y_i

	"vpermilps     $0xb1 , %%xmm14, %%xmm14               \n\t"
	"vpermilps     $0xb1 , %%xmm15, %%xmm15               \n\t"

	"vfmaddps      %%xmm4, %%xmm8 , %%xmm12, %%xmm4     \n\t"  // x_r * y_i, x_i * y_r
	"addq		$16 , %0	  	 	      \n\t"
	"vfmaddps      %%xmm5, %%xmm9 , %%xmm13, %%xmm5     \n\t"  // x_r * y_i, x_i * y_r
	"vfmaddps      %%xmm6, %%xmm10, %%xmm14, %%xmm6     \n\t"  // x_r * y_i, x_i * y_r
	"subq	        $8  , %1		              \n\t"
	"vfmaddps      %%xmm7, %%xmm11, %%xmm15, %%xmm7     \n\t"  // x_r * y_i, x_i * y_r

	"jnz		1b		              \n\t"  // flags come from subq; the FMA does not touch them

	"vaddps        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddps        %%xmm2, %%xmm3, %%xmm2	\n\t"
	"vaddps        %%xmm0, %%xmm2, %%xmm0	\n\t"

	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"
	"vaddps        %%xmm6, %%xmm7, %%xmm6	\n\t"
	"vaddps        %%xmm4, %%xmm6, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"
	"vmovups       %%xmm4,  16(%4)		\n\t"
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)  	// 1
	:
	  "r" (x),      // 2
	  "r" (y),      // 3
	  "r" (dot)     // 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,119 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * Haswell microkernel (256-bit AVX with vfmadd231ps).
 *
 * x and y hold interleaved (real, imag) FLOAT pairs.  Each loop iteration
 * loads 32 FLOATs (16 complex elements) from each vector, advances the
 * FLOAT index by 32 and decrements n by 16.  The loop only terminates when
 * n reaches exactly zero, so n must be a positive multiple of 16.
 *
 * After the loop the eight 256-bit accumulators are reduced to two 128-bit
 * vectors and stored:
 *   dot[0..3] : partial sums of (x_re*y_re, x_im*y_im, x_re*y_re, x_im*y_im)
 *   dot[4..7] : partial sums of (x_re*y_im, x_im*y_re, x_re*y_im, x_im*y_re)
 *
 * The index i and the counter n are modified inside the asm, so they are
 * declared as read-write ("+r") operands; the operand numbers %0..%4 are
 * unchanged by that.
 */
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorps		%%ymm0, %%ymm0, %%ymm0	             \n\t"
	"vxorps		%%ymm1, %%ymm1, %%ymm1	             \n\t"
	"vxorps		%%ymm2, %%ymm2, %%ymm2	             \n\t"
	"vxorps		%%ymm3, %%ymm3, %%ymm3	             \n\t"
	"vxorps		%%ymm4, %%ymm4, %%ymm4	             \n\t"
	"vxorps		%%ymm5, %%ymm5, %%ymm5	             \n\t"
	"vxorps		%%ymm6, %%ymm6, %%ymm6	             \n\t"
	"vxorps		%%ymm7, %%ymm7, %%ymm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"vmovups                  (%2,%0,4), %%ymm8          \n\t"  // 4 * x
	"vmovups                32(%2,%0,4), %%ymm9          \n\t"  // 4 * x

	"vmovups                  (%3,%0,4), %%ymm12         \n\t"  // 4 * y
	"vmovups                32(%3,%0,4), %%ymm13         \n\t"  // 4 * y

	"vmovups                64(%2,%0,4), %%ymm10         \n\t"  // 4 * x
	"vmovups                96(%2,%0,4), %%ymm11         \n\t"  // 4 * x

	"vmovups                64(%3,%0,4), %%ymm14         \n\t"  // 4 * y
	"vmovups                96(%3,%0,4), %%ymm15         \n\t"  // 4 * y

	"vfmadd231ps       %%ymm8 , %%ymm12, %%ymm0     \n\t"  // x_r * y_r, x_i * y_i
	"vfmadd231ps       %%ymm9 , %%ymm13, %%ymm1     \n\t"  // x_r * y_r, x_i * y_i
	"vpermilps     $0xb1 , %%ymm12, %%ymm12               \n\t"  // exchange real and imag part
	"vpermilps     $0xb1 , %%ymm13, %%ymm13               \n\t"

	"vfmadd231ps       %%ymm10, %%ymm14, %%ymm2     \n\t"  // x_r * y_r, x_i * y_i
	"vfmadd231ps       %%ymm11, %%ymm15, %%ymm3     \n\t"  // x_r * y_r, x_i * y_i
	"vpermilps     $0xb1 , %%ymm14, %%ymm14               \n\t"
	"vpermilps     $0xb1 , %%ymm15, %%ymm15               \n\t"

	"vfmadd231ps       %%ymm8 , %%ymm12, %%ymm4     \n\t"  // x_r * y_i, x_i * y_r
	"addq		$32 , %0	  	 	      \n\t"
	"vfmadd231ps       %%ymm9 , %%ymm13, %%ymm5     \n\t"  // x_r * y_i, x_i * y_r
	"vfmadd231ps       %%ymm10, %%ymm14, %%ymm6     \n\t"  // x_r * y_i, x_i * y_r
	"subq	        $16 , %1		              \n\t"
	"vfmadd231ps       %%ymm11, %%ymm15, %%ymm7     \n\t"  // x_r * y_i, x_i * y_r

	"jnz		1b		              \n\t"  // flags come from subq; the FMA does not touch them

	"vaddps        %%ymm0, %%ymm1, %%ymm0	\n\t"
	"vaddps        %%ymm2, %%ymm3, %%ymm2	\n\t"
	"vaddps        %%ymm0, %%ymm2, %%ymm0	\n\t"

	"vaddps        %%ymm4, %%ymm5, %%ymm4	\n\t"
	"vaddps        %%ymm6, %%ymm7, %%ymm6	\n\t"
	"vaddps        %%ymm4, %%ymm6, %%ymm4	\n\t"

	"vextractf128 $1 , %%ymm0 , %%xmm1	\n\t"  // fold upper 128 bits onto lower
	"vextractf128 $1 , %%ymm4 , %%xmm5	\n\t"

	"vaddps        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"
	"vmovups       %%xmm4,  16(%4)		\n\t"
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)  	// 1
	:
	  "r" (x),      // 2
	  "r" (y),      // 3
	  "r" (dot)     // 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,127 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * Sandy Bridge microkernel (256-bit AVX, separate multiply and add).
 *
 * x and y hold interleaved (real, imag) FLOAT pairs.  Each loop iteration
 * loads 32 FLOATs (16 complex elements) from each vector, advances the
 * FLOAT index by 32 and decrements n by 16.  The loop only terminates when
 * n reaches exactly zero, so n must be a positive multiple of 16.
 *
 * ymm10/ymm11 are scratch registers for the products; ymm8/ymm9 are
 * reloaded with the second half of x in the middle of the iteration.
 *
 * After the loop the eight 256-bit accumulators are reduced to two 128-bit
 * vectors and stored:
 *   dot[0..3] : partial sums of (x_re*y_re, x_im*y_im, x_re*y_re, x_im*y_im)
 *   dot[4..7] : partial sums of (x_re*y_im, x_im*y_re, x_re*y_im, x_im*y_re)
 *
 * The index i and the counter n are modified inside the asm, so they are
 * declared as read-write ("+r") operands; the operand numbers %0..%4 are
 * unchanged by that.
 */
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorps		%%ymm0, %%ymm0, %%ymm0	             \n\t"
	"vxorps		%%ymm1, %%ymm1, %%ymm1	             \n\t"
	"vxorps		%%ymm2, %%ymm2, %%ymm2	             \n\t"
	"vxorps		%%ymm3, %%ymm3, %%ymm3	             \n\t"
	"vxorps		%%ymm4, %%ymm4, %%ymm4	             \n\t"
	"vxorps		%%ymm5, %%ymm5, %%ymm5	             \n\t"
	"vxorps		%%ymm6, %%ymm6, %%ymm6	             \n\t"
	"vxorps		%%ymm7, %%ymm7, %%ymm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"vmovups                  (%2,%0,4), %%ymm8          \n\t"  // 4 * x
	"vmovups                32(%2,%0,4), %%ymm9          \n\t"  // 4 * x

	"vmovups                  (%3,%0,4), %%ymm12         \n\t"  // 4 * y
	"vmovups                32(%3,%0,4), %%ymm13         \n\t"  // 4 * y

	"vmovups                64(%3,%0,4), %%ymm14         \n\t"  // 4 * y
	"vmovups                96(%3,%0,4), %%ymm15         \n\t"  // 4 * y

	"vmulps        %%ymm8 , %%ymm12, %%ymm10    \n\t"  // x_r * y_r, x_i * y_i
	"vmulps        %%ymm9 , %%ymm13, %%ymm11    \n\t"
	"vpermilps     $0xb1 , %%ymm12, %%ymm12               \n\t"  // exchange real and imag part
	"vpermilps     $0xb1 , %%ymm13, %%ymm13               \n\t"
	"vaddps        %%ymm0 , %%ymm10, %%ymm0     \n\t"
	"vaddps        %%ymm1 , %%ymm11, %%ymm1     \n\t"
	"vmulps        %%ymm8 , %%ymm12, %%ymm10    \n\t"  // x_r * y_i, x_i * y_r
	"vmulps        %%ymm9 , %%ymm13, %%ymm11    \n\t"

	"vmovups                64(%2,%0,4), %%ymm8          \n\t"  // 4 * x
	"vmovups                96(%2,%0,4), %%ymm9          \n\t"  // 4 * x

	"vaddps        %%ymm4 , %%ymm10, %%ymm4     \n\t"
	"vaddps        %%ymm5 , %%ymm11, %%ymm5     \n\t"

	"vmulps        %%ymm8 , %%ymm14, %%ymm10    \n\t"  // x_r * y_r, x_i * y_i
	"vmulps        %%ymm9 , %%ymm15, %%ymm11    \n\t"
	"vpermilps     $0xb1 , %%ymm14, %%ymm14               \n\t"
	"vpermilps     $0xb1 , %%ymm15, %%ymm15               \n\t"
	"vaddps        %%ymm2 , %%ymm10, %%ymm2     \n\t"
	"vaddps        %%ymm3 , %%ymm11, %%ymm3     \n\t"
	"vmulps        %%ymm8 , %%ymm14, %%ymm10    \n\t"  // x_r * y_i, x_i * y_r
	"vmulps        %%ymm9 , %%ymm15, %%ymm11    \n\t"
	"addq		$32 , %0	  	 	      \n\t"
	"subq	        $16 , %1		              \n\t"
	"vaddps        %%ymm6 , %%ymm10, %%ymm6     \n\t"
	"vaddps        %%ymm7 , %%ymm11, %%ymm7     \n\t"

	"jnz		1b		              \n\t"  // flags come from subq; vaddps does not touch them

	"vaddps        %%ymm0, %%ymm1, %%ymm0	\n\t"
	"vaddps        %%ymm2, %%ymm3, %%ymm2	\n\t"
	"vaddps        %%ymm0, %%ymm2, %%ymm0	\n\t"

	"vaddps        %%ymm4, %%ymm5, %%ymm4	\n\t"
	"vaddps        %%ymm6, %%ymm7, %%ymm6	\n\t"
	"vaddps        %%ymm4, %%ymm6, %%ymm4	\n\t"

	"vextractf128 $1 , %%ymm0 , %%xmm1	\n\t"  // fold upper 128 bits onto lower
	"vextractf128 $1 , %%ymm4 , %%xmm5	\n\t"

	"vaddps        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"
	"vmovups       %%xmm4,  16(%4)		\n\t"
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)  	// 1
	:
	  "r" (x),      // 2
	  "r" (y),      // 3
	  "r" (dot)     // 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
if ( n < 1280 )
|
||||||
|
{
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorps %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vxorps %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vxorps %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vxorps %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||||
|
"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
|
||||||
|
"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
|
||||||
|
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
|
||||||
|
"vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y
|
||||||
|
"vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x
|
||||||
|
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y
|
||||||
|
"vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part
|
||||||
|
"vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
|
||||||
|
"vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $16 , %0 \n\t"
|
||||||
|
"vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddps %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddps %%xmm2, %%xmm3, %%xmm2 \n\t"
|
||||||
|
"vaddps %%xmm0, %%xmm2, %%xmm0 \n\t"
|
||||||
|
|
||||||
|
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
"vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorps %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vxorps %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vxorps %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vxorps %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||||
|
"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
|
||||||
|
"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
|
||||||
|
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"prefetcht0 512(%2,%0,4) \n\t"
|
||||||
|
"vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
|
||||||
|
"vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"prefetcht0 512(%3,%0,4) \n\t"
|
||||||
|
"vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y
|
||||||
|
"vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x
|
||||||
|
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y
|
||||||
|
"vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part
|
||||||
|
"vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
|
||||||
|
"vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $16 , %0 \n\t"
|
||||||
|
"vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddps %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddps %%xmm2, %%xmm3, %%xmm2 \n\t"
|
||||||
|
"vaddps %%xmm0, %%xmm2, %%xmm0 \n\t"
|
||||||
|
|
||||||
|
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
"vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,165 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2015, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <complex.h>
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(BULLDOZER)
|
||||||
|
#include "zdot_microk_bulldozer-2.c"
|
||||||
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
|
||||||
|
#include "zdot_microk_steamroller-2.c"
|
||||||
|
#elif defined(HASWELL)
|
||||||
|
#include "zdot_microk_haswell-2.c"
|
||||||
|
#elif defined(SANDYBRIDGE)
|
||||||
|
#include "zdot_microk_sandy-2.c"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef HAVE_KERNEL_8
|
||||||
|
|
||||||
|
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
|
{
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 };
|
||||||
|
BLASLONG j=0;
|
||||||
|
|
||||||
|
while( i < n )
|
||||||
|
{
|
||||||
|
|
||||||
|
dot[0] += x[j] * y[j] ;
|
||||||
|
dot[1] += x[j+1] * y[j+1] ;
|
||||||
|
dot[2] += x[j] * y[j+1] ;
|
||||||
|
dot[3] += x[j+1] * y[j] ;
|
||||||
|
|
||||||
|
dot[0] += x[j+2] * y[j+2] ;
|
||||||
|
dot[1] += x[j+3] * y[j+3] ;
|
||||||
|
dot[2] += x[j+2] * y[j+3] ;
|
||||||
|
dot[3] += x[j+3] * y[j+2] ;
|
||||||
|
|
||||||
|
dot[0] += x[j+4] * y[j+4] ;
|
||||||
|
dot[1] += x[j+5] * y[j+5] ;
|
||||||
|
dot[2] += x[j+4] * y[j+5] ;
|
||||||
|
dot[3] += x[j+5] * y[j+4] ;
|
||||||
|
|
||||||
|
dot[0] += x[j+6] * y[j+6] ;
|
||||||
|
dot[1] += x[j+7] * y[j+7] ;
|
||||||
|
dot[2] += x[j+6] * y[j+7] ;
|
||||||
|
dot[3] += x[j+7] * y[j+6] ;
|
||||||
|
|
||||||
|
j+=8;
|
||||||
|
i+=4;
|
||||||
|
|
||||||
|
}
|
||||||
|
d[0] = dot[0];
|
||||||
|
d[1] = dot[1];
|
||||||
|
d[2] = dot[2];
|
||||||
|
d[3] = dot[3];
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
 * Complex dot product driver.
 *
 *   n            number of complex elements; n <= 0 yields 0 + 0i
 *   x, y         interleaved (real, imag) vectors
 *   inc_x, inc_y strides, counted in complex elements
 *
 * Four running sums are kept:
 *   dot[0] = sum x_r*y_r   dot[1] = sum x_i*y_i
 *   dot[2] = sum x_r*y_i   dot[3] = sum x_i*y_r
 * and combined at the end: without CONJ as (dot[0]-dot[1]) + i(dot[2]+dot[3]),
 * with CONJ as (dot[0]+dot[1]) + i(dot[2]-dot[3]).
 *
 * For unit strides the leading multiple-of-8 part is handed to
 * zdot_kernel_8 and the remaining elements are handled by a scalar loop.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i;
	BLASLONG ix,iy;
	FLOAT _Complex result;
	FLOAT  dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;

	if ( n <= 0 )
	{
		__real__ result = 0.0 ;
		__imag__ result = 0.0 ;
		return(result);

	}

	if ( (inc_x == 1) && (inc_y == 1) )
	{

		/* BLASLONG, not int: n may exceed the range of int, and a
		   truncated count or index would corrupt the result. */
		BLASLONG n1 = n & -8;
		BLASLONG j;

		if ( n1 )
			zdot_kernel_8(n1, x, y , dot );

		/* scalar tail: elements n1 .. n-1, two FLOATs per element */
		i = n1;
		j = i * 2;

		while( i < n )
		{

			dot[0] += x[j]   * y[j] ;
			dot[1] += x[j+1] * y[j+1] ;
			dot[2] += x[j]   * y[j+1] ;
			dot[3] += x[j+1] * y[j] ;

			j+=2;
			i++ ;

		}


	}
	else
	{
		i=0;
		ix=0;
		iy=0;
		/* convert strides from complex elements to FLOATs */
		inc_x <<= 1;
		inc_y <<= 1;
		while(i < n)
		{

			dot[0] += x[ix]   * y[iy] ;
			dot[1] += x[ix+1] * y[iy+1] ;
			dot[2] += x[ix]   * y[iy+1] ;
			dot[3] += x[ix+1] * y[iy] ;

			ix  += inc_x ;
			iy  += inc_y ;
			i++ ;

		}
	}

#if !defined(CONJ)
	__real__ result = dot[0] - dot[1];
	__imag__ result = dot[2] + dot[3];
#else
	__real__ result = dot[0] + dot[1];
	__imag__ result = dot[2] - dot[3];

#endif

	return(result);

}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
if ( n < 768 )
|
||||||
|
{
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
"vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||||
|
"vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
|
||||||
|
"vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
|
||||||
|
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x
|
||||||
|
"vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y
|
||||||
|
"vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y
|
||||||
|
|
||||||
|
"vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x
|
||||||
|
"vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x
|
||||||
|
|
||||||
|
"vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y
|
||||||
|
"vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
|
||||||
|
"vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
|
||||||
|
"vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $8 , %0 \n\t"
|
||||||
|
"vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
"vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
"vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
"vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||||
|
"vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
|
||||||
|
"vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
|
||||||
|
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"prefetcht0 384(%2,%0,8) \n\t"
|
||||||
|
"vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x
|
||||||
|
"vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x
|
||||||
|
|
||||||
|
"prefetcht0 384(%3,%0,8) \n\t"
|
||||||
|
"vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y
|
||||||
|
"vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y
|
||||||
|
|
||||||
|
"vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x
|
||||||
|
"vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x
|
||||||
|
|
||||||
|
"vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y
|
||||||
|
"vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
|
||||||
|
"vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
|
||||||
|
"vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
|
||||||
|
"vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $8 , %0 \n\t"
|
||||||
|
"vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
"vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
"vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,210 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
if ( n <=1280 )
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t"
|
||||||
|
"vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t"
|
||||||
|
"vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t"
|
||||||
|
"vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t"
|
||||||
|
"vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
|
||||||
|
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
|
||||||
|
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
"vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y
|
||||||
|
"vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x
|
||||||
|
"vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y
|
||||||
|
"vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||||
|
"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||||
|
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $16 , %0 \n\t"
|
||||||
|
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t"
|
||||||
|
"vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t"
|
||||||
|
"vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t"
|
||||||
|
"vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t"
|
||||||
|
"vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $1 , %%ymm0 , %%xmm1 \n\t"
|
||||||
|
"vextractf128 $1 , %%ymm4 , %%xmm5 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t"
|
||||||
|
"vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t"
|
||||||
|
"vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t"
|
||||||
|
"vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t"
|
||||||
|
"vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
|
||||||
|
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
|
||||||
|
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"prefetcht0 512(%2,%0,8) \n\t"
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
"vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"prefetcht0 512(%3,%0,8) \n\t"
|
||||||
|
"vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y
|
||||||
|
"vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"prefetcht0 576(%2,%0,8) \n\t"
|
||||||
|
"vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x
|
||||||
|
"vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"prefetcht0 576(%3,%0,8) \n\t"
|
||||||
|
"vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y
|
||||||
|
"vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||||
|
"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
|
||||||
|
"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||||
|
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"addq $16 , %0 \n\t"
|
||||||
|
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t"
|
||||||
|
"vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t"
|
||||||
|
"vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t"
|
||||||
|
"vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t"
|
||||||
|
"vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $1 , %%ymm0 , %%xmm1 \n\t"
|
||||||
|
"vextractf128 $1 , %%ymm4 , %%xmm5 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%xmm0, (%4) \n\t"
|
||||||
|
"vmovups %%xmm4, 16(%4) \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (dot) // 4
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,222 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
/*
 * zdot_kernel_8 - accumulate the four partial products of a complex dot
 * product with 256-bit AVX instructions.
 *
 * Parameters:
 *   n   - number of complex elements to process. Each loop iteration
 *         consumes 8 elements and the loop only terminates when the
 *         counter reaches exactly zero, so n must be a positive multiple
 *         of 8.
 *   x,y - interleaved (real, imag) input vectors, read with unaligned loads.
 *         NOTE(review): the code uses packed-double instructions and an
 *         8-byte index scale, so FLOAT is assumed to be double - confirm
 *         against the build configuration.
 *   dot - output, four values written:
 *           dot[0] = sum( x_r * y_r )     dot[1] = sum( x_i * y_i )
 *           dot[2] = sum( x_r * y_i )     dot[3] = sum( x_i * y_r )
 *
 * The two asm statements are identical except that the one used for
 * n >= 1280 additionally issues prefetcht0 hints ahead of the loads.
 *
 * The index (operand 0) and the remaining-element counter (operand 1) are
 * modified inside the asm, so they are declared as read-write ("+r")
 * operands; declaring them as plain inputs would let the compiler assume
 * the registers are unchanged.
 */
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = 0;		/* index into x/y, counted in FLOAT units */

	if ( n < 1280 )
	{

		__asm__  __volatile__
		(
		"vzeroupper					 \n\t"
		/* ymm0-ymm3: accumulators for x * y          (x_r*y_r, x_i*y_i) */
		/* ymm4-ymm7: accumulators for x * swapped(y) (x_r*y_i, x_i*y_r) */
		"vxorpd		%%ymm0, %%ymm0, %%ymm0	             \n\t"
		"vxorpd		%%ymm1, %%ymm1, %%ymm1	             \n\t"
		"vxorpd		%%ymm2, %%ymm2, %%ymm2	             \n\t"
		"vxorpd		%%ymm3, %%ymm3, %%ymm3	             \n\t"
		"vxorpd		%%ymm4, %%ymm4, %%ymm4	             \n\t"
		"vxorpd		%%ymm5, %%ymm5, %%ymm5	             \n\t"
		"vxorpd		%%ymm6, %%ymm6, %%ymm6	             \n\t"
		"vxorpd		%%ymm7, %%ymm7, %%ymm7	             \n\t"

		".align 16				             \n\t"
		"1:				             \n\t"
		"vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
		"vmovups                32(%2,%0,8), %%ymm9          \n\t"  // 2 * x

		"vmovups                  (%3,%0,8), %%ymm12         \n\t"  // 2 * y
		"vmovups                32(%3,%0,8), %%ymm13         \n\t"  // 2 * y

		"vmovups                64(%3,%0,8), %%ymm14         \n\t"  // 2 * y
		"vmovups                96(%3,%0,8), %%ymm15         \n\t"  // 2 * y

		"vmulpd      %%ymm8 , %%ymm12, %%ymm10 \n\t"
		"vmulpd      %%ymm9 , %%ymm13, %%ymm11 \n\t"
		"vpermilpd   $0x5   , %%ymm12, %%ymm12 \n\t"  // swap real/imag of y
		"vpermilpd   $0x5   , %%ymm13, %%ymm13 \n\t"
		"vaddpd      %%ymm0 , %%ymm10, %%ymm0  \n\t"
		"vaddpd      %%ymm1 , %%ymm11, %%ymm1  \n\t"
		"vmulpd      %%ymm8 , %%ymm12, %%ymm10 \n\t"
		"vmulpd      %%ymm9 , %%ymm13, %%ymm11 \n\t"
		"vmovups                64(%2,%0,8), %%ymm8          \n\t"  // 2 * x
		"vmovups                96(%2,%0,8), %%ymm9          \n\t"  // 2 * x
		"vaddpd      %%ymm4 , %%ymm10, %%ymm4  \n\t"
		"vaddpd      %%ymm5 , %%ymm11, %%ymm5  \n\t"

		"vmulpd      %%ymm8 , %%ymm14, %%ymm10 \n\t"
		"vmulpd      %%ymm9 , %%ymm15, %%ymm11 \n\t"
		"vpermilpd   $0x5   , %%ymm14, %%ymm14 \n\t"
		"vpermilpd   $0x5   , %%ymm15, %%ymm15 \n\t"
		"vaddpd      %%ymm2 , %%ymm10, %%ymm2  \n\t"
		"vaddpd      %%ymm3 , %%ymm11, %%ymm3  \n\t"
		"vmulpd      %%ymm8 , %%ymm14, %%ymm10 \n\t"
		"addq		$16 , %0	  	     \n\t"  // 8 complex = 16 FLOATs
		"vmulpd      %%ymm9 , %%ymm15, %%ymm11 \n\t"
		"vaddpd      %%ymm6 , %%ymm10, %%ymm6  \n\t"
		"subq	        $8 , %1		             \n\t"  // sets ZF for jnz below
		"vaddpd      %%ymm7 , %%ymm11, %%ymm7  \n\t"  // AVX ops leave flags intact

		"jnz		1b		             \n\t"

		/* reduce the accumulators to one xmm register per product group */
		"vaddpd        %%ymm0, %%ymm1, %%ymm0	\n\t"
		"vaddpd        %%ymm2, %%ymm3, %%ymm2	\n\t"
		"vaddpd        %%ymm0, %%ymm2, %%ymm0	\n\t"

		"vaddpd        %%ymm4, %%ymm5, %%ymm4	\n\t"
		"vaddpd        %%ymm6, %%ymm7, %%ymm6	\n\t"
		"vaddpd        %%ymm4, %%ymm6, %%ymm4	\n\t"

		"vextractf128 $1 , %%ymm0 , %%xmm1	\n\t"
		"vextractf128 $1 , %%ymm4 , %%xmm5	\n\t"

		"vaddpd        %%xmm0, %%xmm1, %%xmm0	\n\t"
		"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"

		"vmovups       %%xmm0,    (%4)		\n\t"  // dot[0], dot[1]
		"vmovups       %%xmm4,  16(%4)		\n\t"  // dot[2], dot[3]
		"vzeroupper					 \n\t"

		:
		  "+r" (i),	// 0
		  "+r" (n)	// 1
		:
		  "r" (x),	// 2
		  "r" (y),	// 3
		  "r" (dot)	// 4
		: "cc",
		  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
		  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
		  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
		  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
		  "memory"
		);
		return;

	}

	/* n >= 1280: same loop with software prefetch of both input streams */
	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorpd		%%ymm0, %%ymm0, %%ymm0	             \n\t"
	"vxorpd		%%ymm1, %%ymm1, %%ymm1	             \n\t"
	"vxorpd		%%ymm2, %%ymm2, %%ymm2	             \n\t"
	"vxorpd		%%ymm3, %%ymm3, %%ymm3	             \n\t"
	"vxorpd		%%ymm4, %%ymm4, %%ymm4	             \n\t"
	"vxorpd		%%ymm5, %%ymm5, %%ymm5	             \n\t"
	"vxorpd		%%ymm6, %%ymm6, %%ymm6	             \n\t"
	"vxorpd		%%ymm7, %%ymm7, %%ymm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"prefetcht0  512(%2,%0,8)                            \n\t"
	"vmovups                  (%2,%0,8), %%ymm8          \n\t"  // 2 * x
	"vmovups                32(%2,%0,8), %%ymm9          \n\t"  // 2 * x

	"prefetcht0  512(%3,%0,8)                            \n\t"
	"vmovups                  (%3,%0,8), %%ymm12         \n\t"  // 2 * y
	"vmovups                32(%3,%0,8), %%ymm13         \n\t"  // 2 * y

	"vmovups                64(%3,%0,8), %%ymm14         \n\t"  // 2 * y
	"vmovups                96(%3,%0,8), %%ymm15         \n\t"  // 2 * y

	"prefetcht0  576(%3,%0,8)                            \n\t"
	"vmulpd      %%ymm8 , %%ymm12, %%ymm10 \n\t"
	"vmulpd      %%ymm9 , %%ymm13, %%ymm11 \n\t"
	"prefetcht0  576(%2,%0,8)                            \n\t"
	"vpermilpd   $0x5   , %%ymm12, %%ymm12 \n\t"  // swap real/imag of y
	"vpermilpd   $0x5   , %%ymm13, %%ymm13 \n\t"
	"vaddpd      %%ymm0 , %%ymm10, %%ymm0  \n\t"
	"vaddpd      %%ymm1 , %%ymm11, %%ymm1  \n\t"
	"vmulpd      %%ymm8 , %%ymm12, %%ymm10 \n\t"
	"vmulpd      %%ymm9 , %%ymm13, %%ymm11 \n\t"
	"vmovups                64(%2,%0,8), %%ymm8          \n\t"  // 2 * x
	"vmovups                96(%2,%0,8), %%ymm9          \n\t"  // 2 * x
	"vaddpd      %%ymm4 , %%ymm10, %%ymm4  \n\t"
	"vaddpd      %%ymm5 , %%ymm11, %%ymm5  \n\t"

	"vmulpd      %%ymm8 , %%ymm14, %%ymm10 \n\t"
	"vmulpd      %%ymm9 , %%ymm15, %%ymm11 \n\t"
	"vpermilpd   $0x5   , %%ymm14, %%ymm14 \n\t"
	"vpermilpd   $0x5   , %%ymm15, %%ymm15 \n\t"
	"vaddpd      %%ymm2 , %%ymm10, %%ymm2  \n\t"
	"vaddpd      %%ymm3 , %%ymm11, %%ymm3  \n\t"
	"vmulpd      %%ymm8 , %%ymm14, %%ymm10 \n\t"
	"addq		$16 , %0	  	     \n\t"  // 8 complex = 16 FLOATs
	"vmulpd      %%ymm9 , %%ymm15, %%ymm11 \n\t"
	"vaddpd      %%ymm6 , %%ymm10, %%ymm6  \n\t"
	"subq	        $8 , %1		             \n\t"  // sets ZF for jnz below
	"vaddpd      %%ymm7 , %%ymm11, %%ymm7  \n\t"

	"jnz		1b		             \n\t"

	"vaddpd        %%ymm0, %%ymm1, %%ymm0	\n\t"
	"vaddpd        %%ymm2, %%ymm3, %%ymm2	\n\t"
	"vaddpd        %%ymm0, %%ymm2, %%ymm0	\n\t"

	"vaddpd        %%ymm4, %%ymm5, %%ymm4	\n\t"
	"vaddpd        %%ymm6, %%ymm7, %%ymm6	\n\t"
	"vaddpd        %%ymm4, %%ymm6, %%ymm4	\n\t"

	"vextractf128 $1 , %%ymm0 , %%xmm1	\n\t"
	"vextractf128 $1 , %%ymm4 , %%xmm5	\n\t"

	"vaddpd        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"  // dot[0], dot[1]
	"vmovups       %%xmm4,  16(%4)		\n\t"  // dot[2], dot[3]
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,193 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
/*
 * zdot_kernel_8 - accumulate the four partial products of a complex dot
 * product with 128-bit fused multiply-add instructions.
 *
 * Parameters:
 *   n   - number of complex elements to process. Each loop iteration
 *         consumes 4 elements (one per xmm register pair) and the loop only
 *         terminates when the counter reaches exactly zero, so n must be a
 *         positive multiple of 4.
 *         NOTE(review): the function name suggests callers pass a multiple
 *         of 8 - confirm against the caller.
 *   x,y - interleaved (real, imag) input vectors, read with unaligned loads.
 *         NOTE(review): the code uses packed-double instructions and an
 *         8-byte index scale, so FLOAT is assumed to be double - confirm
 *         against the build configuration.
 *   dot - output, four values written:
 *           dot[0] = sum( x_r * y_r )     dot[1] = sum( x_i * y_i )
 *           dot[2] = sum( x_r * y_i )     dot[3] = sum( x_i * y_r )
 *
 * The two asm statements are identical except that the one used for
 * n >= 640 additionally issues prefetcht0 hints ahead of the loads.
 *
 * The index (operand 0) and the remaining-element counter (operand 1) are
 * modified inside the asm, so they are declared as read-write ("+r")
 * operands; declaring them as plain inputs would let the compiler assume
 * the registers are unchanged.
 */
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = 0;		/* index into x/y, counted in FLOAT units */

	if ( n < 640 )
	{

		__asm__  __volatile__
		(
		"vzeroupper					 \n\t"
		/* xmm0-xmm3: accumulators for x * y          (x_r*y_r, x_i*y_i) */
		/* xmm4-xmm7: accumulators for x * swapped(y) (x_r*y_i, x_i*y_r) */
		"vxorpd		%%xmm0, %%xmm0, %%xmm0	             \n\t"
		"vxorpd		%%xmm1, %%xmm1, %%xmm1	             \n\t"
		"vxorpd		%%xmm2, %%xmm2, %%xmm2	             \n\t"
		"vxorpd		%%xmm3, %%xmm3, %%xmm3	             \n\t"
		"vxorpd		%%xmm4, %%xmm4, %%xmm4	             \n\t"
		"vxorpd		%%xmm5, %%xmm5, %%xmm5	             \n\t"
		"vxorpd		%%xmm6, %%xmm6, %%xmm6	             \n\t"
		"vxorpd		%%xmm7, %%xmm7, %%xmm7	             \n\t"

		".align 16				             \n\t"
		"1:				             \n\t"
		/* no prefetch in this branch (the hints are disabled for small n) */
		"vmovups                  (%2,%0,8), %%xmm8          \n\t"  // 1 * x
		"vmovups                16(%2,%0,8), %%xmm9          \n\t"  // 1 * x

		"vmovups                  (%3,%0,8), %%xmm12         \n\t"  // 1 * y
		"vmovups                16(%3,%0,8), %%xmm13         \n\t"  // 1 * y

		"vmovups                32(%2,%0,8), %%xmm10         \n\t"  // 1 * x
		"vmovups                48(%2,%0,8), %%xmm11         \n\t"  // 1 * x

		"vmovups                32(%3,%0,8), %%xmm14         \n\t"  // 1 * y
		"vmovups                48(%3,%0,8), %%xmm15         \n\t"  // 1 * y

		"vfmadd231pd      %%xmm8 , %%xmm12, %%xmm0  \n\t"  // x_r * y_r, x_i * y_i
		"vfmadd231pd      %%xmm9 , %%xmm13, %%xmm1  \n\t"  // x_r * y_r, x_i * y_i
		"vpermilpd   $0x1 , %%xmm13, %%xmm13        \n\t"  // swap real/imag of y
		"vpermilpd   $0x1 , %%xmm12, %%xmm12        \n\t"
		"vfmadd231pd      %%xmm10, %%xmm14, %%xmm2  \n\t"  // x_r * y_r, x_i * y_i
		"vfmadd231pd      %%xmm11, %%xmm15, %%xmm3  \n\t"  // x_r * y_r, x_i * y_i
		"vpermilpd   $0x1 , %%xmm14, %%xmm14        \n\t"
		"vpermilpd   $0x1 , %%xmm15, %%xmm15        \n\t"

		"vfmadd231pd      %%xmm8 , %%xmm12, %%xmm4  \n\t"  // x_r * y_i, x_i * y_r
		"addq		$8 , %0	  	 	      \n\t"  // 4 complex = 8 FLOATs
		"vfmadd231pd      %%xmm9 , %%xmm13, %%xmm5  \n\t"  // x_r * y_i, x_i * y_r
		"vfmadd231pd      %%xmm10, %%xmm14, %%xmm6  \n\t"  // x_r * y_i, x_i * y_r
		"subq	        $4 , %1		              \n\t"  // sets ZF for jnz below
		"vfmadd231pd      %%xmm11, %%xmm15, %%xmm7  \n\t"  // x_r * y_i, x_i * y_r

		"jnz		1b		             \n\t"

		/* reduce the accumulators to one xmm register per product group */
		"vaddpd        %%xmm0, %%xmm1, %%xmm0	\n\t"
		"vaddpd        %%xmm2, %%xmm3, %%xmm2	\n\t"
		"vaddpd        %%xmm0, %%xmm2, %%xmm0	\n\t"

		"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"
		"vaddpd        %%xmm6, %%xmm7, %%xmm6	\n\t"
		"vaddpd        %%xmm4, %%xmm6, %%xmm4	\n\t"

		"vmovups       %%xmm0,    (%4)		\n\t"  // dot[0], dot[1]
		"vmovups       %%xmm4,  16(%4)		\n\t"  // dot[2], dot[3]
		"vzeroupper					 \n\t"

		:
		  "+r" (i),	// 0
		  "+r" (n)	// 1
		:
		  "r" (x),	// 2
		  "r" (y),	// 3
		  "r" (dot)	// 4
		: "cc",
		  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
		  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
		  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
		  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
		  "memory"
		);

		return;
	}

	/* n >= 640: same loop with software prefetch of both input streams */
	__asm__  __volatile__
	(
	"vzeroupper					 \n\t"
	"vxorpd		%%xmm0, %%xmm0, %%xmm0	             \n\t"
	"vxorpd		%%xmm1, %%xmm1, %%xmm1	             \n\t"
	"vxorpd		%%xmm2, %%xmm2, %%xmm2	             \n\t"
	"vxorpd		%%xmm3, %%xmm3, %%xmm3	             \n\t"
	"vxorpd		%%xmm4, %%xmm4, %%xmm4	             \n\t"
	"vxorpd		%%xmm5, %%xmm5, %%xmm5	             \n\t"
	"vxorpd		%%xmm6, %%xmm6, %%xmm6	             \n\t"
	"vxorpd		%%xmm7, %%xmm7, %%xmm7	             \n\t"

	".align 16				             \n\t"
	"1:				             \n\t"
	"prefetcht0  512(%2,%0,8)                            \n\t"
	"vmovups                  (%2,%0,8), %%xmm8          \n\t"  // 1 * x
	"vmovups                16(%2,%0,8), %%xmm9          \n\t"  // 1 * x

	"prefetcht0  512(%3,%0,8)                            \n\t"
	"vmovups                  (%3,%0,8), %%xmm12         \n\t"  // 1 * y
	"vmovups                16(%3,%0,8), %%xmm13         \n\t"  // 1 * y

	"vmovups                32(%2,%0,8), %%xmm10         \n\t"  // 1 * x
	"vmovups                48(%2,%0,8), %%xmm11         \n\t"  // 1 * x

	"vmovups                32(%3,%0,8), %%xmm14         \n\t"  // 1 * y
	"vmovups                48(%3,%0,8), %%xmm15         \n\t"  // 1 * y

	"vfmadd231pd      %%xmm8 , %%xmm12, %%xmm0  \n\t"  // x_r * y_r, x_i * y_i
	"vfmadd231pd      %%xmm9 , %%xmm13, %%xmm1  \n\t"  // x_r * y_r, x_i * y_i
	"vpermilpd   $0x1 , %%xmm13, %%xmm13        \n\t"  // swap real/imag of y
	"vpermilpd   $0x1 , %%xmm12, %%xmm12        \n\t"
	"vfmadd231pd      %%xmm10, %%xmm14, %%xmm2  \n\t"  // x_r * y_r, x_i * y_i
	"vfmadd231pd      %%xmm11, %%xmm15, %%xmm3  \n\t"  // x_r * y_r, x_i * y_i
	"vpermilpd   $0x1 , %%xmm14, %%xmm14        \n\t"
	"vpermilpd   $0x1 , %%xmm15, %%xmm15        \n\t"

	"vfmadd231pd      %%xmm8 , %%xmm12, %%xmm4  \n\t"  // x_r * y_i, x_i * y_r
	"addq		$8 , %0	  	 	      \n\t"  // 4 complex = 8 FLOATs
	"vfmadd231pd      %%xmm9 , %%xmm13, %%xmm5  \n\t"  // x_r * y_i, x_i * y_r
	"vfmadd231pd      %%xmm10, %%xmm14, %%xmm6  \n\t"  // x_r * y_i, x_i * y_r
	"subq	        $4 , %1		              \n\t"  // sets ZF for jnz below
	"vfmadd231pd      %%xmm11, %%xmm15, %%xmm7  \n\t"  // x_r * y_i, x_i * y_r

	"jnz		1b		             \n\t"

	"vaddpd        %%xmm0, %%xmm1, %%xmm0	\n\t"
	"vaddpd        %%xmm2, %%xmm3, %%xmm2	\n\t"
	"vaddpd        %%xmm0, %%xmm2, %%xmm0	\n\t"

	"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"
	"vaddpd        %%xmm6, %%xmm7, %%xmm6	\n\t"
	"vaddpd        %%xmm4, %%xmm6, %%xmm4	\n\t"

	"vmovups       %%xmm0,    (%4)		\n\t"  // dot[0], dot[1]
	"vmovups       %%xmm4,  16(%4)		\n\t"  // dot[2], dot[3]
	"vzeroupper					 \n\t"

	:
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue