diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a48289c5..2d54920cc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c -ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c -endif -ZGEMVTKERNEL = zgemv_t.c +#ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 75e40eccb..7b8907044 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#include +#include #include "common.h" #if defined(HASWELL) @@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, BLASLONG n2; FLOAT xbuffer[8],*ybuffer; + +#if 0 +printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); +#endif + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; inc_x *= 2; diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index bb40ec3ac..e1c5838f7 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - "prefetcht0 192(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 - "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r - "vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i - "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r - "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "prefetcht0 192(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 @@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" @@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif - "prefetcht0 192(%3,%0,8) \n\t" - "vmovups (%3,%0,8), %%ymm12 \n\t" - "vmovups 32(%3,%0,8), %%ymm13 \n\t" - -#if !defined(XCONJ) - "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" -#else - "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" - "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" -#endif - + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t"