diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 59ae72ce2..f481f3376 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -2,6 +2,7 @@ ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t_dup.S
 DGEMVNKERNEL = dgemv_n_bulldozer.S
+DGEMVTKERNEL = dgemv_t_bulldozer.S
 SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
diff --git a/kernel/x86_64/dgemv_t_bulldozer.S b/kernel/x86_64/dgemv_t_bulldozer.S
new file mode 100644
index 000000000..487ff77ad
--- /dev/null
+++ b/kernel/x86_64/dgemv_t_bulldozer.S
@@ -0,0 +1,1938 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +// #undef ALIGNED_ACCESS + +#define A_PRE 256 + +#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS +#define VMOVUPS_XL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS + + + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +//Temp variables for M,N,A,LDA +#define MMM 224(%rsp) +#define NN 232(%rsp) +#define AA 240(%rsp) +#define LDAX 248(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp +#define X1 %r15 + +#ifdef ALIGNED_ACCESS +#define MM INCX +#else +#define MM M +#endif + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movq M, MMM + movq N, NN + movq A, AA + movq LDA, LDAX + +#else + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX +#endif +#ifdef HAVE_SSE3 +#ifndef WINDOWS_ABI + movddup %xmm0, ALPHA +#else + movddup %xmm3, ALPHA +#endif +#else +#ifndef WINDOWS_ABI + vmovups %xmm0, ALPHA +#else + vmovups %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA +#endif + + + +.L0x: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + + leaq -1(INCX), %rax + + leaq (,LDA, SIZE), LDA + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L01 + + vmovsd (X), %xmm0 + addq INCX, X + + vmovsd %xmm0, 1 * SIZE(BUFFER) + addq $1 * SIZE, BUFFER + addq $2 * SIZE, X1 + decq M + jle .L10 + ALIGN_4 + +.L01: +#endif + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + vmovsd (X), %xmm0 + addq INCX, X + vmovhpd (X), %xmm0 , %xmm0 + addq INCX, X + + vmovsd (X), %xmm1 
+ addq INCX, X + vmovhpd (X), %xmm1 , %xmm1 + addq INCX, X + + vmovsd (X), %xmm2 + addq INCX, X + vmovhpd (X), %xmm2 , %xmm2 + addq INCX, X + + vmovsd (X), %xmm3 + addq INCX, X + vmovhpd (X), %xmm3 , %xmm3 + addq INCX, X + + vmovups %xmm0, 0 * SIZE(X1) + vmovups %xmm1, 2 * SIZE(X1) + vmovups %xmm2, 4 * SIZE(X1) + vmovups %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + vmovsd (X), %xmm0 + addq INCX, X + vmovsd %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + vxorps %xmm4 , %xmm4, %xmm4 + vxorps %xmm5 , %xmm5, %xmm5 + vxorps %xmm6 , %xmm6, %xmm6 + vxorps %xmm7 , %xmm7, %xmm7 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A1, LDA3), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + vmovsd -16 * SIZE(A2), %xmm8 + vmovsd -16 * SIZE(A2, LDA), %xmm9 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A2, LDA3), %xmm11 + + vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4 + vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5 + vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6 + vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7 + + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L1X: +#endif + + movq M, I + sarq $3, I + jle .L15 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L13 + ALIGN_4 + +.L12: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + prefetchnta A_PRE(A1,LDA,1) + prefetchnta A_PRE(A2,LDA,1) + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + prefetchnta A_PRE(A1,LDA,2) + prefetchnta A_PRE(A2,LDA,2) + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + prefetchnta A_PRE(A1,LDA3,1) + prefetchnta A_PRE(A2,LDA3,1) + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -12 
* SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -12 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L15: + testq $4, M + jle .L16 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , 
%xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 + + vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 + vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 + vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 + vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + jle .L17 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 + vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 + vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 + vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L17: + testq $1, M + je .L18 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A1, LDA3), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + vmovsd -16 * SIZE(A2), %xmm8 + vmovsd -16 * SIZE(A2, LDA), %xmm9 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm10 + vmovsd -16 * SIZE(A2, LDA3), %xmm11 + + vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4 + vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5 + vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6 + vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7 + + ALIGN_4 + +.L18: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + vhaddpd %xmm5, %xmm4 , %xmm4 + vhaddpd %xmm7, %xmm6 , %xmm6 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + vmulpd ALPHA, %xmm4 , %xmm4 + vmulpd ALPHA, %xmm6 , %xmm6 + + cmpq $SIZE, INCY + jne .L19 + + vaddpd 0 * SIZE(Y), %xmm0 , %xmm0 + vaddpd 2 * SIZE(Y), %xmm2 , %xmm2 + vaddpd 4 * SIZE(Y), %xmm4 , %xmm4 + vaddpd 6 * SIZE(Y), %xmm6 , %xmm6 + addq $8 * SIZE, Y + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 2 * SIZE(Y1) + vmovups %xmm4, 4 * SIZE(Y1) + vmovups %xmm6, 6 * SIZE(Y1) + addq $8 * SIZE, Y1 + + cmpq $8, N + jge .L11 + jmp .L20 + ALIGN_4 + +.L19: + + vmovsd (Y), %xmm8 + addq INCY, Y + vmovhpd (Y), %xmm8 , %xmm8 + addq INCY, Y + vmovsd (Y), %xmm9 + addq INCY, Y + vmovhpd (Y), %xmm9 , %xmm9 + addq INCY, Y + vmovsd (Y), %xmm10 + addq INCY, Y + vmovhpd (Y), %xmm10 , %xmm10 + addq INCY, Y + vmovsd (Y), %xmm11 + addq INCY, Y + vmovhpd (Y), %xmm11 , %xmm11 + addq INCY, Y + + vaddpd %xmm8, %xmm0 , %xmm0 + vaddpd %xmm9, %xmm2 , %xmm2 + vaddpd %xmm10, %xmm4 , %xmm4 + vaddpd %xmm11, %xmm6 , %xmm6 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + vmovlpd %xmm4, (Y1) + addq INCY, Y1 + vmovhpd %xmm4, (Y1) + addq INCY, Y1 + vmovlpd %xmm6, (Y1) + addq INCY, Y1 + vmovhpd %xmm6, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + 
vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L2X: +#endif + + movq M, I + sarq $3, I + jle .L25 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L23 + ALIGN_4 + +.L22: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + prefetchnta A_PRE(A1,LDA,1) + prefetchnta A_PRE(A2,LDA,1) + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L25: + testq $4, M + jle .L26 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + 
vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + jle .L27 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L28 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 + + ALIGN_4 + +.L28: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + + cmpq $SIZE, INCY + jne .L29 + + vmovups 0 * SIZE(Y), %xmm4 + vmovups 2 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 2 * SIZE(Y1) + addq $4 * SIZE, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + jmp .L30 + ALIGN_4 + +.L29: + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + vmovsd (Y), %xmm5 + addq INCY, Y + vmovhpd (Y), %xmm5 , %xmm5 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L3X: +#endif + + movq M, I + sarq $3, I + jle .L35 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L33 + ALIGN_4 + +.L32: + + prefetchnta A_PRE(A1) + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + vfmaddpd %xmm2 , -14 * 
SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L35: + testq $4, M + jle .L36 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L36: + testq $2, M + jle .L37 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L37: + testq $1, M + je .L38 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 + + ALIGN_4 + +.L38: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + mulpd ALPHA, %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L31 +#endif + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + movsd -16 * SIZE(X1), %xmm12 + movsd -16 * SIZE(A1), %xmm8 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L4X: +#endif + + movq M, I + sarq $3, I + jle .L45 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: + + prefetchnta A_PRE(A1) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + prefetchnta A_PRE(X1) + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L45: + testq $4, M + jle .L46 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L46: + testq $2, M + jle .L47 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L47: + 
testq $1, M + je .L48 + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm8 + + vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 + ALIGN_4 + +.L48: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vaddpd %xmm1, %xmm0 , %xmm0 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulsd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + + vaddsd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + + + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L5X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vfmaddpd %xmm0, %xmm4 , %xmm12, %xmm0 + vfmaddpd %xmm1, %xmm5 , %xmm12, %xmm1 + vfmaddpd %xmm2, %xmm6 , %xmm12, %xmm2 + vfmaddpd %xmm3, %xmm7 , %xmm12, %xmm3 + + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L5X: +#endif + + vxorps %xmm8 , %xmm8, %xmm8 + vxorps %xmm9 , %xmm9, %xmm9 + vmovhpd -16 * SIZE(A1, LDA), %xmm8 , %xmm8 + vmovhpd -16 * SIZE(A2, LDA), %xmm9 , %xmm9 + + movq M, I + sarq $3, I + jle .L55 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L53 + ALIGN_4 + +.L52: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + VMOVUPS_A1(-12 * SIZE, A1, %xmm4) + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , 
%xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L55: + testq $4, M + jle .L56 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + + vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L56: + testq $2, M + jle .L57 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 + + vshufpd $1, %xmm7, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L57: + testq $1, M + je .L58 + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm6 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + + vshufpd $1, %xmm8, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm2 , %xmm6 , %xmm12 , %xmm2 + + vshufpd $1, %xmm9, %xmm9 , %xmm9 + vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 + + ALIGN_4 + +.L58: + vhaddpd %xmm1, %xmm0 , %xmm0 + vhaddpd %xmm3, %xmm2 , %xmm2 + + vmulpd ALPHA, %xmm0 , %xmm0 + vmulpd ALPHA, %xmm2 , %xmm2 + + cmpq $SIZE, INCY + jne .L59 + + vmovups 0 * SIZE(Y), %xmm4 + vmovups 2 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovups %xmm0, 0 * SIZE(Y1) + vmovups %xmm2, 
2 * SIZE(Y1) + addq $4 * SIZE, Y1 + + cmpq $4, N + jge .L51 + jmp .L60 + ALIGN_4 + +.L59: + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + vmovsd (Y), %xmm5 + addq INCY, Y + vmovhpd (Y), %xmm5 , %xmm5 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + vaddpd %xmm5, %xmm2 , %xmm2 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + vmovlpd %xmm2, (Y1) + addq INCY, Y1 + vmovhpd %xmm2, (Y1) + addq INCY, Y1 + cmpq $4, N + jge .L51 + ALIGN_4 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L6X + + vmovsd -16 * SIZE(X1), %xmm12 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + vfmaddpd %xmm1 , %xmm5 , %xmm12 , %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L6X: +#endif + + vxorps %xmm8 , %xmm8, %xmm8 + vmovhpd -16 * SIZE(A2), %xmm8 , %xmm8 + + movq M, I + sarq $3, I + jle .L65 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L63 + ALIGN_4 + +.L62: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A1(-11 * SIZE, A2, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 + VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) + VMOVUPS_A1(-7 * SIZE, A2, %xmm5) + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm9 , %xmm9 + vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 + VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) + VMOVUPS_A1(-5 * SIZE, A2, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L62 + ALIGN_4 + +.L63: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_A1(-11 * SIZE, A2, %xmm9) + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm9, %xmm7 , %xmm7 + vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 + + vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm8, %xmm9 , %xmm9 + vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 + + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L65: + testq $4, M + jle .L66 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + + vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 + + vshufpd $1, %xmm7, %xmm5 , %xmm5 + 
vmovups %xmm7, %xmm8 + vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L66: + testq $2, M + jle .L67 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + vshufpd $1, %xmm5, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + movaps %xmm5, %xmm8 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L67: + testq $1, M + je .L68 + + vmovsd -16 * SIZE(X1), %xmm12 + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vshufpd $1, %xmm8, %xmm8 , %xmm8 + vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 + ALIGN_4 + +.L68: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulpd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + vmovhpd (Y), %xmm4 , %xmm4 + addq INCY, Y + + vaddpd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 + vmovhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L61 +#endif + ALIGN_4 + +.L70: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + vxorps %xmm0 , %xmm0, %xmm0 + vxorps %xmm1 , %xmm1, %xmm1 + vxorps %xmm2 , %xmm2, %xmm2 + vxorps %xmm3 , %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L7X + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm4 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L7X: +#endif + movq M, I + sarq $3, I + jle .L75 + + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L73 + ALIGN_4 + +.L72: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) + VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L72 + ALIGN_4 + +.L73: + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L75: + testq $4, M + jle .L76 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L76: + testq $2, M + jle .L77 + + VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L77: + testq $1, M + je .L78 + + vmovsd -16 * SIZE(X1), %xmm12 + vmovsd -16 * SIZE(A1), %xmm4 + + vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 + ALIGN_4 + +.L78: + vaddpd %xmm2, %xmm0 , %xmm0 + vaddpd %xmm3, %xmm1 , %xmm1 + + vaddpd %xmm1, %xmm0 , %xmm0 + + vhaddpd %xmm1, %xmm0 , %xmm0 + + vmulsd ALPHA, %xmm0 , %xmm0 + + vmovsd (Y), %xmm4 + addq INCY, Y + + vaddsd %xmm4, %xmm0 , %xmm0 + + vmovlpd %xmm0, (Y1) + addq INCY, Y1 +#endif + ALIGN_4 + +.L999: + leaq (, M, SIZE), %rax + addq %rax,AA + jmp .L0x; + ALIGN_4 + +.L999x: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 
40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE
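
Note for review (not part of the patch): dgemv_t_bulldozer.S implements the transposed double-precision GEMV, y := y + alpha * A^T * x, using Bulldozer's FMA4 (vfmaddpd). The kernel first packs x into a contiguous buffer (BUFFER), then walks column blocks of 8/4/2/1 (depending on GEMV_UNROLL), accumulating one dot product per column, combining the partial sums with vhaddpd, scaling by ALPHA, and updating y with the INCY stride; M is additionally processed in outer chunks of 2^22 rows (the .L0x/.L00 loop). The C sketch below shows the same computation in scalar form as a reference; it is an illustration only — the function name, signature, and loop structure are mine, not the OpenBLAS kernel interface.

    #include <stddef.h>

    /* Illustrative reference only (hypothetical helper, not the OpenBLAS API):
     * computes y[j] += alpha * dot(A(:,j), x) for j = 0..n-1, with A stored
     * column-major with leading dimension lda — the same y += alpha * A^T * x
     * that the unrolled FMA4 kernel above produces. */
    static void dgemv_t_ref(size_t m, size_t n, double alpha,
                            const double *a, size_t lda,
                            const double *x, ptrdiff_t incx,
                            double *y, ptrdiff_t incy)
    {
        for (size_t j = 0; j < n; j++) {
            const double *aj = a + j * lda;   /* column j of A */
            const double *xp = x;
            double sum = 0.0;
            for (size_t i = 0; i < m; i++) {  /* dot product of column j with x */
                sum += aj[i] * *xp;
                xp += incx;
            }
            *y += alpha * sum;                /* accumulate into strided y */
            y += incy;
        }
    }

The assembly gains its speed by unrolling this inner loop over several columns at once (so each load of x feeds up to eight fused multiply-adds), by issuing prefetchnta on the matrix columns, and by packing x into the buffer so the inner loads are unit-stride regardless of incx.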