From 4f2b12b8a887808db8e090d4777074eaf512245a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 19 Jun 2013 17:32:42 +0200 Subject: [PATCH 1/5] added dgemv_t_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/dgemv_t_bulldozer.S | 1938 +++++++++++++++++++++++++++++ 2 files changed, 1939 insertions(+) create mode 100644 kernel/x86_64/dgemv_t_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 59ae72ce2..f481f3376 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -2,6 +2,7 @@ ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t_dup.S DGEMVNKERNEL = dgemv_n_bulldozer.S +DGEMVTKERNEL = dgemv_t_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_t_bulldozer.S b/kernel/x86_64/dgemv_t_bulldozer.S new file mode 100644 index 000000000..487ff77ad --- /dev/null +++ b/kernel/x86_64/dgemv_t_bulldozer.S @@ -0,0 +1,1938 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +// #undef ALIGNED_ACCESS + +#define A_PRE 256 + +#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS +#define VMOVUPS_XL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS + + + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +//Temp variables for M,N,A,LDA +#define MMM 224(%rsp) +#define NN 232(%rsp) +#define AA 240(%rsp) +#define LDAX 248(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp +#define X1 %r15 + +#ifdef ALIGNED_ACCESS +#define MM INCX +#else +#define MM M +#endif + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movq M, MMM + movq N, NN + movq A, AA + movq LDA, LDAX + +#else + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX +#endif +#ifdef HAVE_SSE3 +#ifndef WINDOWS_ABI + movddup %xmm0, ALPHA +#else + movddup %xmm3, ALPHA +#endif +#else +#ifndef WINDOWS_ABI + vmovups %xmm0, ALPHA +#else + vmovups %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA +#endif + + + +.L0x: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + + leaq -1(INCX), %rax + + leaq (,LDA, SIZE), LDA + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L01 + + vmovsd (X), %xmm0 + addq INCX, X + + vmovsd %xmm0, 1 * SIZE(BUFFER) + addq $1 * SIZE, BUFFER + addq $2 * SIZE, X1 + decq M + jle .L10 + ALIGN_4 + +.L01: +#endif + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + vmovsd (X), %xmm0 + addq INCX, X + vmovhpd (X), %xmm0 , %xmm0 + addq INCX, X + + vmovsd (X), %xmm1 
+ addq INCX, X + vmovhpd (X), %xmm1 , %xmm1 + addq INCX, X + + vmovsd (X), %xmm2 + addq INCX, X + vmovhpd (X), %xmm2 , %xmm2 + addq INCX, X + + vmovsd (X), %xmm3 + addq INCX, X + vmovhpd (X), %xmm3 , %xmm3 + addq INCX, X + + vmovups %xmm0, 0 * SIZE(X1) + vmovups %xmm1, 2 * SIZE(X1) + vmovups %xmm2, 4 * SIZE(X1) + vmovups %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + vmovsd (X), %xmm0 + addq INCX, X + vmovsd %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + vxorps %xmm4 , %xmm4, %xmm4 + vxorps %xmm5 , %xmm5, %xmm5 + vxorps %xmm6 , %xmm6, %xmm6 + vxorps %xmm7 , %xmm7, %xmm7 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A1, LDA3), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + vmovsd -16 * SIZE(A2), %xmm8 + vmovsd -16 * SIZE(A2, LDA), %xmm9 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A2, LDA3), %xmm11 + + vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4 + vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5 + vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6 + vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7 + + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L1X: +#endif + + movq M, I + sarq $3, I + jle .L15 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L13 + ALIGN_4 + +.L12: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + prefetchnta A_PRE(A1,LDA,1) + prefetchnta A_PRE(A2,LDA,1) + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + prefetchnta A_PRE(A1,LDA,2) + prefetchnta A_PRE(A2,LDA,2) + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + prefetchnta A_PRE(A1,LDA3,1) + prefetchnta A_PRE(A2,LDA3,1) + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -12 
* SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -12 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L15: + testq $4, M + jle .L16 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , 
%xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + jle .L17 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L17: + testq $1, M + je .L18 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A1, LDA3), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + vmovsd -16 * SIZE(A2), %xmm8 + vmovsd -16 * SIZE(A2, LDA), %xmm9 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A2, LDA3), %xmm11 + + vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4 + vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5 + vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6 + vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7 + + ALIGN_4 + +.L18: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + vhaddpd %xmm5, %xmm4 , %xmm4 + vhaddpd %xmm7, %xmm6 , %xmm6 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + vmulpd ALPHA, %xmm4 , %xmm4 + vmulpd ALPHA, %xmm6 , %xmm6 + + cmpq $SIZE, INCY + jne .L19 + + vaddpd 0 * SIZE(Y), %xmm0 , %xmm0 + vaddpd 2 * SIZE(Y), %xmm2 , %xmm2 + vaddpd 4 * SIZE(Y), %xmm4 , %xmm4 + vaddpd 6 * SIZE(Y), %xmm6 , %xmm6 + addq $8 * SIZE, Y + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 2 * SIZE(Y1) + vmovups %xmm4, 4 * SIZE(Y1) + vmovups %xmm6, 6 * SIZE(Y1) + addq $8 * SIZE, Y1 + + cmpq $8, N + jge .L11 + jmp .L20 + ALIGN_4 + +.L19: + + vmovsd (Y), %xmm8 + addq INCY, Y + vmovhpd (Y), %xmm8 , %xmm8 + addq INCY, Y + vmovsd (Y), %xmm9 + addq INCY, Y + vmovhpd (Y), %xmm9 , %xmm9 + addq INCY, Y + vmovsd (Y), %xmm10 + addq INCY, Y + vmovhpd (Y), %xmm10 , %xmm10 + addq INCY, Y + vmovsd (Y), %xmm11 + addq INCY, Y + vmovhpd (Y), %xmm11 , %xmm11 + addq INCY, Y + + vaddpd %xmm8, %xmm0 , %xmm0 + vaddpd %xmm9, %xmm2 , %xmm2 + vaddpd %xmm10, %xmm4 , %xmm4 + vaddpd %xmm11, %xmm6 , %xmm6 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + vmovlpd %xmm4, (Y1) + addq INCY, Y1 + vmovhpd %xmm4, (Y1) + addq INCY, Y1 + vmovlpd %xmm6, (Y1) + addq INCY, Y1 + vmovhpd %xmm6, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + 
vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L2X: +#endif + + movq M, I + sarq $3, I + jle .L25 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L23 + ALIGN_4 + +.L22: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + prefetchnta A_PRE(A1,LDA,1) + prefetchnta A_PRE(A2,LDA,1) + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L25: + testq $4, M + jle .L26 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + 
vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + jle .L27 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L28 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + ALIGN_4 + +.L28: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + + cmpq $SIZE, INCY + jne .L29 + + vmovups 0 * SIZE(Y), %xmm4 + vmovups 2 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 2 * SIZE(Y1) + addq $4 * SIZE, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + jmp .L30 + ALIGN_4 + +.L29: + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + vmovsd (Y), %xmm5 + addq INCY, Y + vmovhpd (Y), %xmm5 , %xmm5 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L3X: +#endif + + movq M, I + sarq $3, I + jle .L35 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L33 + ALIGN_4 + +.L32: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -14 * 
SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L35: + testq $4, M + jle .L36 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L36: + testq $2, M + jle .L37 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L37: + testq $1, M + je .L38 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + + ALIGN_4 + +.L38: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + mulpd ALPHA, %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L31 +#endif + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + movsd -16 * SIZE(X1), %xmm12 + movsd -16 * SIZE(A1), %xmm8 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L4X: +#endif + + movq M, I + sarq $3, I + jle .L45 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: + + prefetchnta A_PRE(A1) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L45: + testq $4, M + jle .L46 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L46: + testq $2, M + jle .L47 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L47: + 
testq $1, M + je .L48 + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm8 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + ALIGN_4 + +.L48: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vaddpd %xmm1, %xmm0 , %xmm0 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulsd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + + vaddsd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L5X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vfmaddpd %xmm0, %xmm4 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm5 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm6 , %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm7 , %xmm12, %xmm3 + + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L5X: +#endif + + vxorps %xmm8 , %xmm8, %xmm8 + vxorps %xmm9 , %xmm9, %xmm9 + vmovhpd -16 * SIZE(A1, LDA), %xmm8 , %xmm8 + vmovhpd -16 * SIZE(A2, LDA), %xmm9 , %xmm9 + + movq M, I + sarq $3, I + jle .L55 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L53 + ALIGN_4 + +.L52: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + VMOVUPS_A1(-12 * SIZE, A1, %xmm4) + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , 
%xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L55: + testq $4, M + jle .L56 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L56: + testq $2, M + jle .L57 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L57: + testq $1, M + je .L58 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm6 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + + vshufpd $1, %xmm8, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , %xmm6 , %xmm12 , %xmm2 + + vshufpd $1, %xmm9, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + + ALIGN_4 + +.L58: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + + cmpq $SIZE, INCY + jne .L59 + + vmovups 0 * SIZE(Y), %xmm4 + vmovups 2 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 
2 * SIZE(Y1) + addq $4 * SIZE, Y1 + + cmpq $4, N + jge .L51 + jmp .L60 + ALIGN_4 + +.L59: + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + vmovsd (Y), %xmm5 + addq INCY, Y + vmovhpd (Y), %xmm5 , %xmm5 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + cmpq $4, N + jge .L51 + ALIGN_4 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L6X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + vfmaddpd %xmm1 , %xmm5 , %xmm12 , %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L6X: +#endif + + vxorps %xmm8 , %xmm8, %xmm8 + vmovhpd -16 * SIZE(A2), %xmm8 , %xmm8 + + movq M, I + sarq $3, I + jle .L65 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L63 + ALIGN_4 + +.L62: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A1(-11 * SIZE, A2, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A1(-7 * SIZE, A2, %xmm5) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm9 , %xmm9 + vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A1(-5 * SIZE, A2, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L62 + ALIGN_4 + +.L63: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A1(-11 * SIZE, A2, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm9 , %xmm9 + vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L65: + testq $4, M + jle .L66 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + 
vmovups %xmm7, %xmm8 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L66: + testq $2, M + jle .L67 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + movaps %xmm5, %xmm8 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L67: + testq $1, M + je .L68 + + vmovsd -16 * SIZE(X1), %xmm12 + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vshufpd $1, %xmm8, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + ALIGN_4 + +.L68: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulpd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L61 +#endif + ALIGN_4 + +.L70: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L7X + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm4 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L7X: +#endif + movq M, I + sarq $3, I + jle .L75 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L73 + ALIGN_4 + +.L72: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L72 + ALIGN_4 + +.L73: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L75: + testq $4, M + jle .L76 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L76: + testq $2, M + jle .L77 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L77: + testq $1, M + je .L78 + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm4 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + ALIGN_4 + +.L78: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vaddpd %xmm1, %xmm0 , %xmm0 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulsd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + + vaddsd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 +#endif + ALIGN_4 + +.L999: + leaq (, M, SIZE), %rax + addq %rax,AA + jmp .L0x; + ALIGN_4 + +.L999x: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 
40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE From 89405a1a0bda4eb63873dca319964815387f8a68 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 19 Jun 2013 19:31:38 +0200 Subject: [PATCH 2/5] cleanup of dgemm_ncopy_8_bulldozer.S --- kernel/x86_64/dgemm_ncopy_8_bulldozer.S | 606 ++++++++++++------------ 1 file changed, 302 insertions(+), 304 deletions(-) diff --git a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S index 26a14b76a..1b934f6bb 100644 --- a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S +++ b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S @@ -128,20 +128,20 @@ testq $SIZE, A je .L12 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO1, LDA, 2), %xmm2 - movsd 0 * SIZE(AO1, LDA3), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO1, LDA3), %xmm3 - movsd 0 * SIZE(AO2), %xmm4 - movsd 0 * SIZE(AO2, LDA), %xmm5 - movsd 0 * SIZE(AO2, LDA, 2), %xmm6 - movsd 0 * SIZE(AO2, LDA3), %xmm7 + vmovsd 0 * SIZE(AO2), %xmm4 + vmovsd 0 * SIZE(AO2, LDA), %xmm5 + vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovsd 0 * SIZE(AO2, LDA3), %xmm7 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 - unpcklpd %xmm5, %xmm4 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + vunpcklpd %xmm5, %xmm4 , %xmm4 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -172,9 +172,9 @@ vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups %xmm0, %xmm8 prefetchnta A_PRE(AO1, LDA3) - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 prefetchnta A_PRE(AO2) @@ -186,10 +186,10 @@ prefetchnta A_PRE(AO2, LDA, 2) vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 prefetchnta A_PRE(AO2, LDA3) vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE(B) vmovups %xmm0, -16 * SIZE(B) @@ -197,10 +197,10 @@ vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+64(B) @@ -217,9 +217,9 @@ vmovups 2 * SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 2 * SIZE(AO2), %xmm4 @@ -228,9 +228,9 @@ vmovups 2 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+128(B) @@ -239,10 +239,10 @@ vmovups %xmm4, 4 * SIZE(B) vmovups %xmm6, 6 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+192(B) @@ -259,9 +259,9 @@ vmovups 4 * 
SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 4 * SIZE(AO2), %xmm4 @@ -270,9 +270,9 @@ vmovups 4 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+256(B) @@ -281,11 +281,10 @@ vmovups %xmm4, 20 * SIZE(B) vmovups %xmm6, 22 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 - + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+320(B) vmovups %xmm8, 24 * SIZE(B) @@ -301,9 +300,9 @@ vmovups 6 * SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 6 * SIZE(AO2), %xmm4 @@ -312,9 +311,9 @@ vmovups 6 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+384(B) @@ -323,11 +322,10 @@ vmovups %xmm4, 36 * SIZE(B) vmovups %xmm6, 38 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 - + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+448(B) vmovups %xmm8, 40 * SIZE(B) @@ -358,24 +356,24 @@ vmovups 0 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, -8 * SIZE(B) vmovups %xmm9, -6 * SIZE(B) @@ -393,24 +391,24 @@ vmovups 2 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, 0 * SIZE(B) vmovups %xmm2, 2 * SIZE(B) vmovups %xmm4, 4 * SIZE(B) vmovups %xmm6, 6 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, 8 * SIZE(B) vmovups %xmm9, 10 * SIZE(B) @@ -437,24 +435,24 @@ vmovups 0 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 - unpcklpd %xmm5, %xmm4 + vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm7, %xmm6 , 
%xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) - unpckhpd %xmm1, %xmm8 - unpckhpd %xmm3, %xmm9 - unpckhpd %xmm5, %xmm10 - unpckhpd %xmm7, %xmm11 + vunpckhpd %xmm1, %xmm8 , %xmm8 + vunpckhpd %xmm3, %xmm9 , %xmm9 + vunpckhpd %xmm5, %xmm10, %xmm10 + vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, -8 * SIZE(B) vmovups %xmm9, -6 * SIZE(B) @@ -470,20 +468,20 @@ testq $1, MM jle .L19 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO1, LDA, 2), %xmm2 - movsd 0 * SIZE(AO1, LDA3), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO1, LDA3), %xmm3 - movsd 0 * SIZE(AO2), %xmm4 - movsd 0 * SIZE(AO2, LDA), %xmm5 - movsd 0 * SIZE(AO2, LDA, 2), %xmm6 - movsd 0 * SIZE(AO2, LDA3), %xmm7 + vmovsd 0 * SIZE(AO2), %xmm4 + vmovsd 0 * SIZE(AO2, LDA), %xmm5 + vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovsd 0 * SIZE(AO2, LDA3), %xmm7 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 - unpcklpd %xmm5, %xmm4 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + vunpcklpd %xmm5, %xmm4 , %xmm4 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -509,13 +507,13 @@ testq $SIZE, A je .L22 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO2), %xmm2 - movsd 0 * SIZE(AO2, LDA), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA), %xmm3 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -539,12 +537,12 @@ vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) @@ -559,12 +557,12 @@ vmovups 2 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) @@ -579,12 +577,12 @@ vmovups 4 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, 0 * SIZE(B) @@ -599,12 +597,12 @@ vmovups 6 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) @@ -630,12 +628,12 @@ vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * 
SIZE(B) @@ -648,12 +646,12 @@ vmovups 2 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) @@ -675,12 +673,12 @@ vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -696,13 +694,13 @@ testq $1, MM jle .L30 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO2), %xmm2 - movsd 0 * SIZE(AO2, LDA), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA), %xmm3 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -720,10 +718,10 @@ testq $SIZE, A je .L32 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO2), %xmm1 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) @@ -746,12 +744,12 @@ vmovups 2 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) @@ -766,12 +764,12 @@ vmovups 6 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm3, %xmm2 , %xmm2 - unpckhpd %xmm1, %xmm4 - unpckhpd %xmm3, %xmm6 + vunpckhpd %xmm1, %xmm4 , %xmm4 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) @@ -797,12 +795,12 @@ vmovups 2 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 - unpcklpd %xmm1, %xmm0 - unpckhpd %xmm1, %xmm4 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpckhpd %xmm1, %xmm4 , %xmm4 vmovups %xmm2, %xmm6 - unpcklpd %xmm3, %xmm2 - unpckhpd %xmm3, %xmm6 + vunpcklpd %xmm3, %xmm2 , %xmm2 + vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm4, -14 * SIZE(B) @@ -822,8 +820,8 @@ vmovups 0 * SIZE(AO2), %xmm1 vmovups %xmm0, %xmm2 - unpcklpd %xmm1, %xmm0 - unpckhpd %xmm1, %xmm2 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpckhpd %xmm1, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -837,10 +835,10 @@ testq $1, MM jle .L40 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO2), %xmm1 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B @@ -910,9 +908,9 @@ testq $1, MM jle .L999 - movsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1), %xmm0 - movlpd %xmm0, -16 * SIZE(B) + vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 @@ -931,10 +929,10 @@ vmovups 5 * SIZE(AO1), %xmm3 vmovups 7 * SIZE(AO1), %xmm4 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm2, %xmm1 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm4, %xmm3 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm2, %xmm1 , %xmm1 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + 
vshufpd $1, %xmm4, %xmm3 , %xmm3 vmovups %xmm0, -16 * SIZE(B) @@ -958,8 +956,8 @@ vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm2, %xmm1 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm2, %xmm1 , %xmm1 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) @@ -976,7 +974,7 @@ vmovups 1 * SIZE(AO1), %xmm1 - shufpd $1, %xmm1, %xmm0 + vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) @@ -990,9 +988,9 @@ testq $1, M jle .L999 - shufpd $1, %xmm0, %xmm0 + vshufpd $1, %xmm0, %xmm0 , %xmm0 - movlpd %xmm0, -16 * SIZE(B) + vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 @@ -1010,19 +1008,19 @@ testq $SIZE, A je .L52 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO1, LDA, 2), %xmm2 - movsd 0 * SIZE(AO1, LDA3), %xmm3 - movsd 0 * SIZE(AO2), %xmm4 - movsd 0 * SIZE(AO2, LDA), %xmm5 - movsd 0 * SIZE(AO2, LDA, 2), %xmm6 - movsd 0 * SIZE(AO2, LDA3), %xmm7 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO1, LDA3), %xmm3 + vmovsd 0 * SIZE(AO2), %xmm4 + vmovsd 0 * SIZE(AO2, LDA), %xmm5 + vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovsd 0 * SIZE(AO2, LDA3), %xmm7 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 - unpcklpd %xmm5, %xmm4 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + vunpcklpd %xmm5, %xmm4 , %xmm4 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -1058,10 +1056,10 @@ vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 - movsd %xmm0, %xmm9 - movsd %xmm2, %xmm10 - movsd %xmm4, %xmm11 - movsd %xmm6, %xmm12 + vmovsd %xmm0, %xmm9 , %xmm9 + vmovsd %xmm2, %xmm10, %xmm10 + vmovsd %xmm4, %xmm11, %xmm11 + vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) @@ -1069,10 +1067,10 @@ vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm5, %xmm4 - shufpd $1, %xmm7, %xmm6 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + vshufpd $1, %xmm5, %xmm4 , %xmm4 + vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) @@ -1092,10 +1090,10 @@ vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 vmovups 3 * SIZE(AO2, LDA3), %xmm12 - movsd %xmm0, %xmm1 - movsd %xmm2, %xmm3 - movsd %xmm4, %xmm5 - movsd %xmm6, %xmm7 + vmovsd %xmm0, %xmm1 , %xmm1 + vmovsd %xmm2, %xmm3 , %xmm3 + vmovsd %xmm4, %xmm5 , %xmm5 + vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 0 * SIZE(B) @@ -1103,10 +1101,10 @@ vmovups %xmm5, 4 * SIZE(B) vmovups %xmm7, 6 * SIZE(B) - shufpd $1, %xmm9, %xmm0 - shufpd $1, %xmm10, %xmm2 - shufpd $1, %xmm11, %xmm4 - shufpd $1, %xmm12, %xmm6 + vshufpd $1, %xmm9, %xmm0 , %xmm0 + vshufpd $1, %xmm10, %xmm2 , %xmm2 + vshufpd $1, %xmm11, %xmm4 , %xmm4 + vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) @@ -1126,10 +1124,10 @@ vmovups 4 * SIZE(AO2, LDA, 2), %xmm6 vmovups 5 * SIZE(AO2, LDA3), %xmm7 - movsd %xmm0, %xmm9 - movsd %xmm2, %xmm10 - movsd %xmm4, %xmm11 - movsd %xmm6, %xmm12 + vmovsd %xmm0, %xmm9 , %xmm9 + vmovsd %xmm2, %xmm10, %xmm10 + vmovsd %xmm4, %xmm11, %xmm11 + vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, 16 * SIZE(B) @@ -1137,10 +1135,10 @@ vmovups %xmm11, 20 * SIZE(B) vmovups %xmm12, 22 * SIZE(B) - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm5, %xmm4 - shufpd $1, %xmm7, %xmm6 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + vshufpd $1, %xmm5, %xmm4 , %xmm4 + vshufpd $1, %xmm7, %xmm6 , 
%xmm6 vmovups %xmm0, 24 * SIZE(B) @@ -1160,10 +1158,10 @@ vmovups 6 * SIZE(AO2, LDA, 2), %xmm6 vmovups 7 * SIZE(AO2, LDA3), %xmm12 - movsd %xmm0, %xmm1 - movsd %xmm2, %xmm3 - movsd %xmm4, %xmm5 - movsd %xmm6, %xmm7 + vmovsd %xmm0, %xmm1 , %xmm1 + vmovsd %xmm2, %xmm3 , %xmm3 + vmovsd %xmm4, %xmm5 , %xmm5 + vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 32 * SIZE(B) @@ -1171,10 +1169,10 @@ vmovups %xmm5, 36 * SIZE(B) vmovups %xmm7, 38 * SIZE(B) - shufpd $1, %xmm9, %xmm0 - shufpd $1, %xmm10, %xmm2 - shufpd $1, %xmm11, %xmm4 - shufpd $1, %xmm12, %xmm6 + vshufpd $1, %xmm9, %xmm0 , %xmm0 + vshufpd $1, %xmm10, %xmm2 , %xmm2 + vshufpd $1, %xmm11, %xmm4 , %xmm4 + vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 40 * SIZE(B) vmovups %xmm2, 42 * SIZE(B) @@ -1202,20 +1200,20 @@ vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 - movsd %xmm0, %xmm9 - movsd %xmm2, %xmm10 - movsd %xmm4, %xmm11 - movsd %xmm6, %xmm12 + vmovsd %xmm0, %xmm9 , %xmm9 + vmovsd %xmm2, %xmm10, %xmm10 + vmovsd %xmm4, %xmm11, %xmm11 + vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) vmovups %xmm10, -14 * SIZE(B) vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm5, %xmm4 - shufpd $1, %xmm7, %xmm6 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + vshufpd $1, %xmm5, %xmm4 , %xmm4 + vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) @@ -1231,20 +1229,20 @@ vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 vmovups 3 * SIZE(AO2, LDA3), %xmm12 - movsd %xmm0, %xmm1 - movsd %xmm2, %xmm3 - movsd %xmm4, %xmm5 - movsd %xmm6, %xmm7 + vmovsd %xmm0, %xmm1 , %xmm1 + vmovsd %xmm2, %xmm3 , %xmm3 + vmovsd %xmm4, %xmm5 , %xmm5 + vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 0 * SIZE(B) vmovups %xmm3, 2 * SIZE(B) vmovups %xmm5, 4 * SIZE(B) vmovups %xmm7, 6 * SIZE(B) - shufpd $1, %xmm9, %xmm0 - shufpd $1, %xmm10, %xmm2 - shufpd $1, %xmm11, %xmm4 - shufpd $1, %xmm12, %xmm6 + vshufpd $1, %xmm9, %xmm0 , %xmm0 + vshufpd $1, %xmm10, %xmm2 , %xmm2 + vshufpd $1, %xmm11, %xmm4 , %xmm4 + vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) vmovups %xmm2, 10 * SIZE(B) @@ -1269,20 +1267,20 @@ vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 - movsd %xmm0, %xmm9 - movsd %xmm2, %xmm10 - movsd %xmm4, %xmm11 - movsd %xmm6, %xmm12 + vmovsd %xmm0, %xmm9 , %xmm9 + vmovsd %xmm2, %xmm10, %xmm10 + vmovsd %xmm4, %xmm11, %xmm11 + vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) vmovups %xmm10, -14 * SIZE(B) vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm5, %xmm4 - shufpd $1, %xmm7, %xmm6 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + vshufpd $1, %xmm5, %xmm4 , %xmm4 + vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) @@ -1298,19 +1296,19 @@ testq $1, MM jle .L59 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO1, LDA, 2), %xmm2 - movsd 0 * SIZE(AO1, LDA3), %xmm3 - movsd 0 * SIZE(AO2), %xmm4 - movsd 0 * SIZE(AO2, LDA), %xmm5 - movsd 0 * SIZE(AO2, LDA, 2), %xmm6 - movsd 0 * SIZE(AO2, LDA3), %xmm7 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO1, LDA3), %xmm3 + vmovsd 0 * SIZE(AO2), %xmm4 + vmovsd 0 * SIZE(AO2, LDA), %xmm5 + vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovsd 0 * SIZE(AO2, LDA3), %xmm7 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 
- unpcklpd %xmm5, %xmm4 - unpcklpd %xmm7, %xmm6 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + vunpcklpd %xmm5, %xmm4 , %xmm4 + vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -1336,13 +1334,13 @@ testq $SIZE, A je .L62 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO2), %xmm2 - movsd 0 * SIZE(AO2, LDA), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA), %xmm3 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -1368,10 +1366,10 @@ vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * SIZE(AO2, LDA), %xmm3 - movsd %xmm0, %xmm5 - movsd %xmm2, %xmm7 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vmovsd %xmm2, %xmm7 , %xmm7 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) @@ -1385,10 +1383,10 @@ vmovups 2 * SIZE(AO2), %xmm2 vmovups 3 * SIZE(AO2, LDA), %xmm7 - movsd %xmm0, %xmm1 - movsd %xmm2, %xmm3 - shufpd $1, %xmm5, %xmm0 - shufpd $1, %xmm7, %xmm2 + vmovsd %xmm0, %xmm1 , %xmm1 + vmovsd %xmm2, %xmm3 , %xmm3 + vshufpd $1, %xmm5, %xmm0 , %xmm0 + vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, -8 * SIZE(B) @@ -1402,10 +1400,10 @@ vmovups 4 * SIZE(AO2), %xmm2 vmovups 5 * SIZE(AO2, LDA), %xmm3 - movsd %xmm0, %xmm5 - movsd %xmm2, %xmm7 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vmovsd %xmm2, %xmm7 , %xmm7 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, 0 * SIZE(B) @@ -1419,10 +1417,10 @@ vmovups 6 * SIZE(AO2), %xmm2 vmovups 7 * SIZE(AO2, LDA), %xmm7 - movsd %xmm0, %xmm1 - movsd %xmm2, %xmm3 - shufpd $1, %xmm5, %xmm0 - shufpd $1, %xmm7, %xmm2 + vmovsd %xmm0, %xmm1 , %xmm1 + vmovsd %xmm2, %xmm3 , %xmm3 + vshufpd $1, %xmm5, %xmm0 , %xmm0 + vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, 8 * SIZE(B) @@ -1447,10 +1445,10 @@ vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * SIZE(AO2, LDA), %xmm3 - movsd %xmm0, %xmm5 - shufpd $1, %xmm1, %xmm0 - movsd %xmm2, %xmm7 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vmovsd %xmm2, %xmm7 , %xmm7 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm7, -14 * SIZE(B) @@ -1462,10 +1460,10 @@ vmovups 2 * SIZE(AO2), %xmm2 vmovups 3 * SIZE(AO2, LDA), %xmm7 - movsd %xmm0, %xmm1 - shufpd $1, %xmm5, %xmm0 - movsd %xmm2, %xmm3 - shufpd $1, %xmm7, %xmm2 + vmovsd %xmm0, %xmm1 , %xmm1 + vshufpd $1, %xmm5, %xmm0 , %xmm0 + vmovsd %xmm2, %xmm3 , %xmm3 + vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, -8 * SIZE(B) vmovups %xmm3, -6 * SIZE(B) @@ -1486,10 +1484,10 @@ vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * SIZE(AO2, LDA), %xmm3 - movsd %xmm0, %xmm5 - movsd %xmm2, %xmm7 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vmovsd %xmm2, %xmm7 , %xmm7 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm7, -14 * SIZE(B) @@ -1505,13 +1503,13 @@ testq $1, MM jle .L70 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO1, LDA), %xmm1 - movsd 0 * SIZE(AO2), %xmm2 - movsd 0 * SIZE(AO2, LDA), %xmm3 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA), %xmm3 - unpcklpd %xmm1, %xmm0 - unpcklpd %xmm3, %xmm2 + vunpcklpd %xmm1, %xmm0 , 
%xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) @@ -1529,10 +1527,10 @@ testq $SIZE, A je .L72 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO2), %xmm1 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) @@ -1556,10 +1554,10 @@ vmovups 2 * SIZE(AO1), %xmm2 vmovups 3 * SIZE(AO2), %xmm3 - movsd %xmm0, %xmm5 - shufpd $1, %xmm1, %xmm0 - movsd %xmm2, %xmm1 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vmovsd %xmm2, %xmm1 , %xmm1 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) @@ -1573,10 +1571,10 @@ vmovups 6 * SIZE(AO1), %xmm2 vmovups 7 * SIZE(AO2), %xmm5 - movsd %xmm0, %xmm3 - shufpd $1, %xmm1, %xmm0 - movsd %xmm2, %xmm1 - shufpd $1, %xmm5, %xmm2 + vmovsd %xmm0, %xmm3 , %xmm3 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vmovsd %xmm2, %xmm1 , %xmm1 + vshufpd $1, %xmm5, %xmm2 , %xmm2 vmovups %xmm3, -8 * SIZE(B) @@ -1601,10 +1599,10 @@ vmovups 2 * SIZE(AO1), %xmm2 vmovups 3 * SIZE(AO2), %xmm3 - movsd %xmm0, %xmm5 - shufpd $1, %xmm1, %xmm0 - movsd %xmm2, %xmm1 - shufpd $1, %xmm3, %xmm2 + vmovsd %xmm0, %xmm5 , %xmm5 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vmovsd %xmm2, %xmm1 , %xmm1 + vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm0, -14 * SIZE(B) @@ -1625,8 +1623,8 @@ vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO2), %xmm1 - movsd %xmm0, %xmm5 - shufpd $1, %xmm1, %xmm0 + vmovsd %xmm0, %xmm5 , %xmm5 + vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm0, -14 * SIZE(B) @@ -1640,10 +1638,10 @@ testq $1, MM jle .L80 - movsd 0 * SIZE(AO1), %xmm0 - movsd 0 * SIZE(AO2), %xmm1 + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 - unpcklpd %xmm1, %xmm0 + vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B @@ -1713,9 +1711,9 @@ testq $1, MM jle .L999 - movsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1), %xmm0 - movlpd %xmm0, -16 * SIZE(B) + vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 @@ -1734,10 +1732,10 @@ vmovups 5 * SIZE(AO1), %xmm3 vmovups 7 * SIZE(AO1), %xmm4 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm2, %xmm1 - shufpd $1, %xmm3, %xmm2 - shufpd $1, %xmm4, %xmm3 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm2, %xmm1 , %xmm1 + vshufpd $1, %xmm3, %xmm2 , %xmm2 + vshufpd $1, %xmm4, %xmm3 , %xmm3 vmovups %xmm0, -16 * SIZE(B) @@ -1761,8 +1759,8 @@ vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 - shufpd $1, %xmm1, %xmm0 - shufpd $1, %xmm2, %xmm1 + vshufpd $1, %xmm1, %xmm0 , %xmm0 + vshufpd $1, %xmm2, %xmm1 , %xmm1 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) @@ -1779,7 +1777,7 @@ vmovups 1 * SIZE(AO1), %xmm1 - shufpd $1, %xmm1, %xmm0 + vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) @@ -1793,9 +1791,9 @@ testq $1, M jle .L999 - shufpd $1, %xmm0, %xmm0 + vshufpd $1, %xmm0, %xmm0 , %xmm0 - movlpd %xmm0, -16 * SIZE(B) + vmovlpd %xmm0, -16 * SIZE(B) ALIGN_4 .L999: From 8dc0c72583eb60ec5e8272049e35fd34af409a13 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 20 Jun 2013 14:07:54 +0200 Subject: [PATCH 3/5] added daxpy_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/daxpy_bulldozer.S | 408 ++++++++++++++++++++++++++++++++ 2 files changed, 409 insertions(+) create mode 100644 kernel/x86_64/daxpy_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index f481f3376..2afeec014 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ 
b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,6 +3,7 @@ ZGEMVTKERNEL = zgemv_t_dup.S DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S +DAXPYKERNEL = daxpy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/daxpy_bulldozer.S b/kernel/x86_64/daxpy_bulldozer.S new file mode 100644 index 000000000..dfc10e80f --- /dev/null +++ b/kernel/x86_64/daxpy_bulldozer.S @@ -0,0 +1,408 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#define A_PRE 640 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + vmovups %xmm0, ALPHA +#else + vmovups %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + unpcklpd ALPHA, ALPHA + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L47 + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, Y + je .L10 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + movq M, %rax + sarq $4, %rax + jle .L13 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + vmovups -12 * SIZE(X), %xmm2 + vmovups -10 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + + prefetchnta A_PRE(Y) + + vmovups -8 * SIZE(X), %xmm4 + vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 + vmovups -6 * SIZE(X), %xmm5 + vmovups -4 * SIZE(X), %xmm6 + vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 + vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 + vmovups -2 * SIZE(X), %xmm7 + + + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + prefetchnta A_PRE(X) + nop + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + + prefetchnta A_PRE+64(Y) + + vmovups 0 * SIZE(X), %xmm0 + vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 + vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 + vmovups 2 * SIZE(X), %xmm1 + vmovups 4 * SIZE(X), %xmm2 + vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 + vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 + vmovups 6 * SIZE(X), %xmm3 + + + vmovups %xmm4, -8 * SIZE(Y) + vmovups %xmm5, -6 * SIZE(Y) + prefetchnta A_PRE+64(X) + nop + vmovups %xmm6, -4 * SIZE(Y) + vmovups %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + + vmovups -8 * SIZE(X), %xmm4 + vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 + vmovups -6 * SIZE(X), %xmm5 + vmovups -4 * SIZE(X), %xmm6 + vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 + vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 + vmovups -2 * SIZE(X), %xmm7 + + + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + + vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 + vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 + vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 + vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 + + vmovups %xmm4, -8 * SIZE(Y) + vmovups %xmm5, -6 * SIZE(Y) + vmovups %xmm6, -4 * SIZE(Y) + vmovups %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + + + movq M, %rax + andq $8, %rax + jle .L14 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + vmovups -12 * SIZE(X), %xmm2 + vmovups -10 * SIZE(X), %xmm3 + + vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 + vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 + vfmaddpd -10 * SIZE(Y), ALPHA, 
%xmm3 , %xmm3 + + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + movq M, %rax + andq $4, %rax + jle .L15 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + + vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 + + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L16 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + vmovups %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + vmovsd -16 * SIZE(X), %xmm0 + vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + +.L40: + movq Y, YY + movq M, %rax +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L46 + cmpq $0, INCY + je .L46 + + sarq $3, %rax + jle .L45 + + prefetchnta 512(X) + prefetchnta 512+64(X) + prefetchnta 512+128(X) + prefetchnta 512+192(X) + + prefetchnta 512(Y) + prefetchnta 512+64(Y) + prefetchnta 512+128(Y) + prefetchnta 512+192(Y) + ALIGN_3 + +.L41: + + vmovsd 0 * SIZE(X), %xmm0 + addq INCX, X + vmovhpd 0 * SIZE(X), %xmm0 , %xmm0 + addq INCX, X + + vmovsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6 + addq INCY, YY + + + vmovsd 0 * SIZE(X), %xmm1 + addq INCX, X + vmovhpd 0 * SIZE(X), %xmm1 , %xmm1 + addq INCX, X + + vmovsd 0 * SIZE(YY), %xmm7 + addq INCY, YY + vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7 + addq INCY, YY + + vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0 + + vmovsd 0 * SIZE(X), %xmm2 + addq INCX, X + vmovhpd 0 * SIZE(X), %xmm2 , %xmm2 + addq INCX, X + + vmovsd 0 * SIZE(YY), %xmm8 + addq INCY, YY + vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8 + addq INCY, YY + + vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1 + + vmovsd 0 * SIZE(X), %xmm3 + addq INCX, X + vmovhpd 0 * SIZE(X), %xmm3 , %xmm3 + addq INCX, X + + vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2 + + vmovsd 0 * SIZE(YY), %xmm9 + addq INCY, YY + vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9 + addq INCY, YY + + + vmovsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + vmovhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + vmovsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + vmovhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + vmovsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + vmovhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + + vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3 + + vmovsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + vmovhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + vmovsd (X), %xmm0 + addq INCX, X + + vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0 + + vmovsd %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE From bcbac31b477ce1902802d98e929d7ca7d9956646 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 20 Jun 2013 16:15:09 +0200 Subject: [PATCH 4/5] added ddot_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/ddot_bulldozer.S | 311 +++++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 kernel/x86_64/ddot_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 2afeec014..5dd6f6d8d 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ 
b/kernel/x86_64/KERNEL.BULLDOZER @@ -4,6 +4,7 @@ ZGEMVTKERNEL = zgemv_t_dup.S DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S DAXPYKERNEL = daxpy_bulldozer.S +DDOTKERNEL = ddot_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/ddot_bulldozer.S b/kernel/x86_64/ddot_bulldozer.S new file mode 100644 index 000000000..503ec60cf --- /dev/null +++ b/kernel/x86_64/ddot_bulldozer.S @@ -0,0 +1,311 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define A_PRE 512 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + vxorps %xmm0, %xmm0 , %xmm0 + vxorps %xmm1, %xmm1 , %xmm1 + vxorps %xmm2, %xmm2 , %xmm2 + vxorps %xmm3, %xmm3 , %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + je .L10 + + vmovsd -16 * SIZE(X), %xmm0 + vmulsd -16 * SIZE(Y), %xmm0 , %xmm0 + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + ALIGN_2 + +.L10: + + movq N, %rax + sarq $4, %rax + jle .L14 + + vmovups -16 * SIZE(X), %xmm4 + vmovups -14 * SIZE(X), %xmm5 + vmovups -12 * SIZE(X), %xmm6 + vmovups -10 * SIZE(X), %xmm7 + + vmovups -8 * SIZE(X), %xmm8 + vmovups -6 * SIZE(X), %xmm9 + vmovups -4 * SIZE(X), %xmm10 + vmovups -2 * SIZE(X), %xmm11 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: + prefetchnta A_PRE(Y) + + vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 + prefetchnta A_PRE(X) + vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 + + vmovups 0 * SIZE(X), %xmm4 + vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 + vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 + vmovups 2 * SIZE(X), %xmm5 + vmovups 4 * SIZE(X), %xmm6 + vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 + vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 + vmovups 6 * SIZE(X), %xmm7 + + prefetchnta A_PRE+64(Y) + + vmovups 8 * SIZE(X), %xmm8 + vmovups 10 * SIZE(X), %xmm9 + prefetchnta A_PRE+64(X) + vmovups 12 * SIZE(X), %xmm10 + vmovups 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + + vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 + + vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 + vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 + vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 + vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + vmovups -16 * SIZE(X), %xmm4 + vmovups -14 * SIZE(X), %xmm5 + vmovups -12 * SIZE(X), %xmm6 + vmovups -10 * SIZE(X), %xmm7 + + vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + vmovups -16 * SIZE(X), %xmm4 + vmovups -14 * SIZE(X), %xmm5 + + vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + vmovups -16 * SIZE(X), %xmm4 + vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 + + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + vmovsd -16 * SIZE(X), %xmm4 + vmovsd -16 * SIZE(Y), %xmm5 + vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0 + jmp .L999 + ALIGN_3 + + +.L50: + movq N, 
%rax + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L53: + + + vmovsd 0 * SIZE(X), %xmm4 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm8 + addq INCY, Y + vmovsd 0 * SIZE(X), %xmm5 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm9 + addq INCY, Y + + vmovsd 0 * SIZE(X), %xmm6 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm10 + addq INCY, Y + vmovsd 0 * SIZE(X), %xmm7 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm11 + addq INCY, Y + + vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 + vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 + vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 + vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 + + + vmovsd 0 * SIZE(X), %xmm4 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm8 + addq INCY, Y + vmovsd 0 * SIZE(X), %xmm5 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm9 + addq INCY, Y + + vmovsd 0 * SIZE(X), %xmm6 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm10 + addq INCY, Y + vmovsd 0 * SIZE(X), %xmm7 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm11 + addq INCY, Y + + vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 + vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 + vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 + vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L56: + vmovsd 0 * SIZE(X), %xmm4 + addq INCX, X + vmovsd 0 * SIZE(Y), %xmm8 + addq INCY, Y + + vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + vaddpd %xmm1, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm2 , %xmm2 + vaddpd %xmm2, %xmm0 , %xmm0 + + vhaddpd %xmm0, %xmm0 , %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE From 16012767f4f3b7595fb6039e56c85b325774b426 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 21 Jun 2013 16:06:51 +0200 Subject: [PATCH 5/5] added dcopy_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 5 +- kernel/x86_64/dcopy_bulldozer.S | 291 ++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dcopy_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 5dd6f6d8d..8ebd42244 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,8 +3,9 @@ ZGEMVTKERNEL = zgemv_t_dup.S DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S -DAXPYKERNEL = daxpy_bulldozer.S -DDOTKERNEL = ddot_bulldozer.S +DAXPYKERNEL = daxpy_bulldozer.S +DDOTKERNEL = ddot_bulldozer.S +DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dcopy_bulldozer.S b/kernel/x86_64/dcopy_bulldozer.S new file mode 100644 index 000000000..87f1a4e31 --- /dev/null +++ b/kernel/x86_64/dcopy_bulldozer.S @@ -0,0 +1,291 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG +#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2 +#define A_PRE 640 +#define B_PRE 640 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, X + je .L10 + + vmovsd (X), %xmm0 + vmovsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + + movq M, %rax + sarq $4, %rax + jle .L13 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + vmovups -12 * SIZE(X), %xmm2 + vmovups -10 * SIZE(X), %xmm3 + vmovups -8 * SIZE(X), %xmm4 + vmovups -6 * SIZE(X), %xmm5 + vmovups -4 * SIZE(X), %xmm6 + vmovups -2 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_4 + +.L11: + + prefetchnta A_PRE(X) + nop + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + prefetchnta B_PRE(Y) + nop + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + + VLOAD( 0 * SIZE, X, %xmm0) + VLOAD( 2 * SIZE, X, %xmm1) + VLOAD( 4 * SIZE, X, %xmm2) + VLOAD( 6 * SIZE, X, %xmm3) + + prefetchnta A_PRE+64(X) + nop + vmovups %xmm4, -8 * SIZE(Y) + vmovups %xmm5, -6 * SIZE(Y) + prefetchnta B_PRE+64(Y) + nop + vmovups %xmm6, -4 * SIZE(Y) + vmovups %xmm7, -2 * SIZE(Y) + + VLOAD( 8 * SIZE, X, %xmm4) + VLOAD(10 * SIZE, X, %xmm5) + subq $-16 * SIZE, Y + VLOAD(12 * SIZE, X, %xmm6) + VLOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + vmovups %xmm4, -8 * SIZE(Y) + vmovups %xmm5, -6 * SIZE(Y) + vmovups %xmm6, -4 * SIZE(Y) + vmovups %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + vmovups -12 * SIZE(X), %xmm2 + vmovups -10 * SIZE(X), %xmm3 + + vmovups 
%xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + vmovups %xmm2, -12 * SIZE(Y) + vmovups %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vmovups -14 * SIZE(X), %xmm1 + + vmovups %xmm0, -16 * SIZE(Y) + vmovups %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + ALIGN_3 + + vmovups -16 * SIZE(X), %xmm0 + vmovups %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + ALIGN_3 + + vmovsd -16 * SIZE(X), %xmm0 + vmovsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + + +.L40: + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + vmovsd (X), %xmm0 + addq INCX, X + vmovsd (X), %xmm4 + addq INCX, X + vmovsd (X), %xmm1 + addq INCX, X + vmovsd (X), %xmm5 + addq INCX, X + vmovsd (X), %xmm2 + addq INCX, X + vmovsd (X), %xmm6 + addq INCX, X + vmovsd (X), %xmm3 + addq INCX, X + vmovsd (X), %xmm7 + addq INCX, X + + vmovsd %xmm0, (Y) + addq INCY, Y + vmovsd %xmm4, (Y) + addq INCY, Y + vmovsd %xmm1, (Y) + addq INCY, Y + vmovsd %xmm5, (Y) + addq INCY, Y + vmovsd %xmm2, (Y) + addq INCY, Y + vmovsd %xmm6, (Y) + addq INCY, Y + vmovsd %xmm3, (Y) + addq INCY, Y + vmovsd %xmm7, (Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + vmovsd (X), %xmm0 + addq INCX, X + vmovsd %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE
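
Note on the added kernels: the new DAXPYKERNEL, DDOTKERNEL and DCOPYKERNEL entries in KERNEL.BULLDOZER redirect the double-precision level-1 routines to the FMA4-based assembly above; the operations themselves are the standard BLAS ones. As a minimal reference sketch of what these files compute (plain C, illustrative only; the function names and the plain long types are placeholders, not the OpenBLAS kernel ABI, and the sketch assumes non-negative increments):

/* Reference semantics for the level-1 kernels added in this series.
 * The assembly versions implement the same loops with deep unrolling,
 * prefetchnta and FMA4 (vfmaddpd/vfmaddsd) instructions. */

/* daxpy: y[i*incy] += alpha * x[i*incx] */
static void daxpy_ref(long n, double alpha,
                      const double *x, long incx,
                      double *y, long incy)
{
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}

/* ddot: returns the sum over i of x[i*incx] * y[i*incy] */
static double ddot_ref(long n,
                       const double *x, long incx,
                       const double *y, long incy)
{
    double dot = 0.0;
    for (long i = 0; i < n; i++)
        dot += x[i * incx] * y[i * incy];
    return dot;
}

/* dcopy: y[i*incy] = x[i*incx] */
static void dcopy_ref(long n,
                      const double *x, long incx,
                      double *y, long incy)
{
    for (long i = 0; i < n; i++)
        y[i * incy] = x[i * incx];
}

In the unit-stride paths the assembly fuses the multiply and add of daxpy and ddot into one instruction: as used above, vfmaddpd mem, ALPHA, %xmmN, %xmmN loads two doubles from memory and computes xmmN = alpha * xmmN + mem in a single operation, which is why these kernels carry no separate mulpd/addpd pair. The strided fallbacks (.L40/.L50) and the DGEMVT buffer-transpose code follow the same element-wise definitions, processing one element per vmovsd/vmovhpd pair.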