From f67fa62851735e93a5fdb2166f70071d73f8727e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 15 Jun 2013 16:42:37 +0200 Subject: [PATCH] added dgemv_n_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dgemv_n_bulldozer.S | 2405 +++++++++++++++++++++++++++++ param.h | 1 + 3 files changed, 2408 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 8b3d1084a..8261bf42f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,6 +1,8 @@ ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t_dup.S +DGEMVNKERNEL = dgemv_n_bulldozer.S + SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/dgemv_n_bulldozer.S b/kernel/x86_64/dgemv_n_bulldozer.S new file mode 100644 index 000000000..dcd7af7aa --- /dev/null +++ b/kernel/x86_64/dgemv_n_bulldozer.S @@ -0,0 +1,2405 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define A_PRE 256 + +#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS +#define VMOVUPS_YL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_YS1(OFF, ADDR, REGS) vmovups REGS, OFF(ADDR) + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + vmovsd %xmm0, ALPHA +#else + vmovsd %xmm3, ALPHA +#endif + + leaq -1(INCY), %rax + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef ALIGNED_ACCESS + leaq -1 (M), MM + testq $SIZE, A + cmoveq M, MM +#endif + + testq N, N # if n <= 0 goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + +#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) +#ifndef NOCOPY_UNALIGNED + movq Y, Y1 + andq $0xf, Y1 + orq Y1, %rax +#endif + testq %rax, %rax + cmoveq Y, BUFFER + je .L10 +#endif + + movq BUFFER, Y1 + + vxorpd %xmm4, %xmm4, %xmm4 + + movq M, %rax + addq $16, %rax + sarq $4, %rax + ALIGN_3 + +.L01: + vmovups %xmm4, 0 * SIZE(Y1) + vmovups %xmm4, 2 * SIZE(Y1) + vmovups %xmm4, 4 * SIZE(Y1) + vmovups %xmm4, 6 * SIZE(Y1) + vmovups %xmm4, 8 * SIZE(Y1) + vmovups %xmm4, 10 * SIZE(Y1) + vmovups %xmm4, 12 * SIZE(Y1) + vmovups %xmm4, 14 * SIZE(Y1) + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + +#ifdef ALIGNED_ACCESS + leaq SIZE(BUFFER), %rax + testq $SIZE, A + cmovne %rax, BUFFER + + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq 
$8, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + + vmovddup (X), %xmm8 + addq INCX, X + vmovddup (X), %xmm9 + addq INCX, X + vmovddup (X), %xmm10 + addq INCX, X + vmovddup (X), %xmm11 + addq INCX, X + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm8 , %xmm8 + vmulpd %xmm0, %xmm9 , %xmm9 + vmulpd %xmm0, %xmm10 , %xmm10 + vmulpd %xmm0, %xmm11 , %xmm11 + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A1, LDA3), %xmm7 + + + vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 + + vmovsd -16 * SIZE(A2), %xmm4 + vmovsd -16 * SIZE(A2, LDA), %xmm5 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A2, LDA3), %xmm7 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L1X: +#endif + + movq MM, I + sarq $3, I + jle .L15 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: + + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + prefetchnta A_PRE(A1,LDA,1) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + prefetchnta A_PRE(A1,LDA,2) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + prefetchnta A_PRE(A1,LDA3,1) + VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + prefetchnta A_PRE(A2) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, 
%xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + prefetchnta A_PRE(A2,LDA,1) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + prefetchnta A_PRE(A2,LDA,2) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + prefetchnta A_PRE(A2,LDA3,1) + VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + prefetchnta A_PRE(A1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + VMOVUPS_A1( -2 * SIZE, A1, %xmm7) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm13, 
%xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L16 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm9 , %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm7 , %xmm1 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm11, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm7 , %xmm1 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm13, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm7 , %xmm1 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm15, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm7 , %xmm1 + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $2, MM + je .L17 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddpd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddpd %xmm0 , %xmm11, %xmm7 , %xmm0 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddpd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddpd %xmm0 , %xmm15, %xmm7 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, MM + je .L18 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 + 
vmovsd -16 * SIZE(A1, LDA3), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + + vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 + + vmovsd -16 * SIZE(A2), %xmm4 + vmovsd -16 * SIZE(A2, LDA), %xmm5 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A2, LDA3), %xmm7 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L18: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L2X: +#endif + + movq MM, I + sarq $3, I + jle .L25 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: + + + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + prefetchnta A_PRE(A2) + VMOVUPS_A1(-12 * SIZE, A2, %xmm2) + VMOVUPS_A1(-10 * SIZE, A2, %xmm3) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + prefetchnta A_PRE(A2, LDA, 1) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + prefetchnta A_PRE(A1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 + + VMOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) + prefetchnta 
A_PRE(A1, LDA, 1) + VMOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + VMOVUPS_A1(-12 * SIZE, A2, %xmm2) + VMOVUPS_A1(-10 * SIZE, A2, %xmm3) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L26 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $2, MM + je .L27 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + VMOVUPS_A1(-16 * SIZE, A2, %xmm10) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12, %xmm8 , %xmm0 + vfmaddpd %xmm0 , %xmm13, %xmm9 , %xmm0 + vfmaddpd %xmm0 , %xmm14, %xmm10, %xmm0 + vfmaddpd %xmm0 , %xmm15, %xmm11, %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L28 +#else + je .L30 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + vfmaddsd %xmm0 , %xmm12, %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm9 , 
%xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm10, %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm11, %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L28: + cmpq $4, N + jge .L21 + ALIGN_3 + +#endif + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L3X: +#endif + + movq MM, I + sarq $3, I + jle .L35 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: + + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + vmovups -8 * SIZE(A1), %xmm0 + vmovups -6 * SIZE(A1), %xmm1 + prefetchnta A_PRE(A1) + vmovups -4 * SIZE(A1), %xmm2 + vmovups -2 * SIZE(A1), %xmm3 + + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 + prefetchnta A_PRE(A2) + vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 + + vmovups -8 * SIZE(A2), %xmm4 + vmovups -6 * SIZE(A2), %xmm5 + vmovups -4 * SIZE(A2), %xmm6 + vmovups -2 * SIZE(A2) , %xmm7 + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L36 + + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , 
%xmm5 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $2, MM + je .L37 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + VMOVUPS_A1(-16 * SIZE, A2, %xmm9) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + vfmaddpd %xmm0 , %xmm13 , %xmm9 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L38 +#else + je .L40 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm9 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L38: + cmpq $2, N + jge .L31 + ALIGN_3 + +#endif + +.L40: + cmpq $1, N + jl .L900 +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + vmovddup (X), %xmm12 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L4X: +#endif + + movq MM, I + sarq $3, I + jle .L45 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L44 + ALIGN_3 + +.L43: + + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $4, MM + je .L46 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $2, MM + je .L47 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $1, MM + je .L900 + + vmovsd -16 * SIZE(Y1), %xmm0 + vmovsd -16 * 
SIZE(A1), %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#ifdef ALIGNED_ACCESS + jmp .L900 + ALIGN_3 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + + testq $SIZE, A + je .L5X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15 , %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L5X: + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $3, I + jle .L55 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: + + + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + prefetchnta A_PRE(A1, LDA, 1) + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + + prefetchnta A_PRE(A2) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + prefetchnta A_PRE(A2, LDA, 1) + vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + + shufpd $1, %xmm4, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + + prefetchnta A_PRE(A1) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm9, %xmm6 + vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 
+ subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + + +.L54: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 + shufpd $1, %xmm9, %xmm6 + vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $4, MM + je .L56 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + movaps %xmm7, %xmm9 + shufpd $1, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm15 , %xmm6 , %xmm1 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $2, MM + je .L57 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A1(-16 * SIZE, A2, %xmm6) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + shufpd $1, %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm14 , %xmm6 , %xmm0 + shufpd $1, %xmm7, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + movaps %xmm7, %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $1, MM + je .L58 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 
+ vmovsd -16 * SIZE(A2), %xmm6 + shufpd $1, %xmm9, %xmm9 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15 , %xmm9 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L58: + cmpq $4, N + jge .L51 + ALIGN_3 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + + testq $SIZE, A + je .L6X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L6X: + movhpd -16 * SIZE(A2), %xmm8 + + movq MM, I + sarq $3, I + jle .L65 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L64 + ALIGN_3 + +.L63: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A1(-15 * SIZE, A2, %xmm4) + + prefetchnta A_PRE(A2) + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A1(-13 * SIZE, A2, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A1(-11 * SIZE, A2, %xmm6) + + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + + prefetchnta A_PRE(A1) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L63 + ALIGN_3 + +.L64: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A1(-15 * SIZE, A2, %xmm4) + + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A1(-13 * SIZE, A2, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A1(-11 * SIZE, A2, %xmm6) + + shufpd $0x01, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $0x01, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + + shufpd $0x01, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + shufpd $0x01, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L65: + testq $4, MM + je .L66 + + + VMOVUPS_A1(-16 * SIZE, 
A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm6) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + shufpd $0x01, %xmm6, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm7, %xmm8 + shufpd $0x01, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L66: + testq $2, MM + je .L67 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + shufpd $0x01, %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm5, %xmm8 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L67: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L68 +#else + je .L70 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + vshufpd $0x01, %xmm8, %xmm8 , %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L68: + cmpq $2, N + jge .L61 + ALIGN_3 + +#endif + +.L70: + cmpq $1, N + jl .L900 + +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + vmovddup (X), %xmm12 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + + testq $SIZE, A + je .L7X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L7X: + + movq MM, I + sarq $3, I + jle .L75 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L74 + ALIGN_3 + +.L73: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + + prefetchnta A_PRE(A1) + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L73 + ALIGN_3 + +.L74: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L75: + testq $4, MM + je .L76 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L76: + testq $2, MM + je .L77 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L77: + testq $1, MM + je .L900 + + vmovsd -16 * SIZE(Y1), %xmm0 + vmovsd -16 * SIZE(A1), %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) +#endif + ALIGN_3 + + +.L900: +#ifndef COPY_FORCE + cmpq Y, BUFFER + je .L999 +#endif + + cmpq $SIZE, INCY + jne .L950 + + testq $SIZE, Y + je .L910 + + vmovsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + vmovsd %xmm0, (Y) + + addq $SIZE, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L910: + testq $SIZE, BUFFER + jne .L920 + + movq M, %rax + sarq $3, %rax + jle .L914 + ALIGN_3 + +.L912: + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + vmovups 4 * SIZE(Y), %xmm2 + vmovups 6 * SIZE(Y), %xmm3 + + vmovups 0 * SIZE(BUFFER), %xmm4 + vmovups 2 * SIZE(BUFFER), %xmm5 + vmovups 4 * SIZE(BUFFER), %xmm6 + vmovups 6 * SIZE(BUFFER), %xmm7 + + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + vmovups %xmm2, 4 * SIZE(Y) + vmovups %xmm3, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L912 + ALIGN_3 + +.L914: + testq $7, M + jle .L999 + + testq $4, M + jle .L915 + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + + vmovups 0 * SIZE(BUFFER), %xmm4 + vmovups 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L915: + testq $2, M + jle .L916 + + vmovups (Y), %xmm0 + + vmovups (BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + vmovups %xmm0, (Y) + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L916: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vmovsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L920: + vmovups -1 * SIZE(BUFFER), %xmm4 + + movq M, %rax + sarq $3, %rax + jle .L924 + ALIGN_3 + +.L922: + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + vmovups 4 * SIZE(Y), %xmm2 + vmovups 6 * SIZE(Y), %xmm3 + + vmovups 1 * SIZE(BUFFER), %xmm5 + vmovups 3 * SIZE(BUFFER), %xmm6 + vmovups 5 * SIZE(BUFFER), %xmm7 + vmovups 7 * SIZE(BUFFER), %xmm8 + + shufpd $0x01, %xmm5, %xmm4 + shufpd $0x01, %xmm6, %xmm5 + shufpd $0x01, %xmm7, %xmm6 + shufpd $0x01, %xmm8, %xmm7 + + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + vmovups %xmm2, 4 * SIZE(Y) + vmovups %xmm3, 6 * SIZE(Y) + + vmovups %xmm8, %xmm4 + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L922 + ALIGN_3 + +.L924: + testq $7, M + jle .L999 + + testq $4, M + jle .L925 + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + + vmovups 1 * SIZE(BUFFER), %xmm5 + vmovups 3 * SIZE(BUFFER), %xmm6 + + shufpd $0x01, %xmm5, %xmm4 + shufpd $0x01, %xmm6, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + + vmovups %xmm6, %xmm4 + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L925: + testq $2, M + jle .L926 + + vmovups (Y), %xmm0 + + 
vmovups 1 * SIZE(BUFFER), %xmm5 + + shufpd $0x01, %xmm5, %xmm4 + + addpd %xmm4, %xmm0 + + vmovups %xmm0, (Y) + + movaps %xmm5, %xmm4 + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L926: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vshufpd $0x01, %xmm4 ,%xmm4, %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L950: + testq $SIZE, BUFFER + je .L960 + + vmovsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + vmovsd %xmm0, (Y) + + addq INCY, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L960: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L964 + ALIGN_3 + +.L962: + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + vmovsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + vmovups 2 * SIZE(BUFFER), %xmm5 + + vmovsd (Y), %xmm2 + addq INCY, Y + movhpd (Y), %xmm2 + addq INCY, Y + + vmovups 4 * SIZE(BUFFER), %xmm6 + + addpd %xmm4, %xmm0 + + vmovsd (Y), %xmm3 + addq INCY, Y + movhpd (Y), %xmm3 + addq INCY, Y + + vmovups 6 * SIZE(BUFFER), %xmm7 + + addpd %xmm5, %xmm1 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addpd %xmm6, %xmm2 + + vmovsd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addpd %xmm7, %xmm3 + + vmovsd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + vmovsd %xmm3, (Y1) + addq INCY, Y1 + movhpd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L962 + ALIGN_3 + +.L964: + testq $7, M + jle .L999 + + testq $4, M + jle .L965 + + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + vmovsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + vmovups 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + vmovsd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L965: + testq $2, M + jle .L966 + + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L966: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vmovsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/param.h b/param.h index 0357c1323..76cc3236f 100644 --- a/param.h +++ b/param.h @@ -187,6 +187,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 #endif #if 0
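
Note (not part of the patch): for readers who want to follow the assembly above, the sketch below is a plain scalar C reference of the operation the new kernel implements, written under the usual OpenBLAS GEMV_N convention (column-major A of size m x n with leading dimension lda, y += alpha * A * x, arbitrary incx/incy). The function and parameter names here are illustrative assumptions, not the kernel's actual entry point; the assembly additionally unrolls over columns (GEMV_UNROLL = 8 from param.h) and rows, copies y through an aligned buffer, and uses FMA4 (vfmaddpd/vfmaddsd) with prefetchnta, none of which is shown here.

```c
/* Illustrative reference only -- NOT part of the patch.
 * Scalar sketch of what dgemv_n_bulldozer.S computes:
 *   y := y + alpha * A * x,  A column-major, m x n, leading dimension lda. */
static void dgemv_n_ref(long m, long n, double alpha,
                        const double *a, long lda,
                        const double *x, long incx,
                        double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        /* alpha * x[j]; the kernel likewise pre-scales the broadcast
         * x values (xmm8..xmm15) by alpha before the inner loop.      */
        double t = alpha * x[j * incx];
        const double *aj = a + j * lda;     /* column j of A            */
        for (long i = 0; i < m; i++)
            y[i * incy] += t * aj[i];       /* axpy into y              */
    }
}
```

The column-at-a-time structure mirrors the kernel's loop nest: the assembly processes up to eight columns per outer iteration (one scaled x value per column in xmm8..xmm15) and eight rows of y per inner iteration, which is why the tail code tests MM against 4, 2 and 1.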