From f67fa62851735e93a5fdb2166f70071d73f8727e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 15 Jun 2013 16:42:37 +0200 Subject: [PATCH 1/6] added dgemv_n_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dgemv_n_bulldozer.S | 2405 +++++++++++++++++++++++++++++ param.h | 1 + 3 files changed, 2408 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 8b3d1084a..8261bf42f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,6 +1,8 @@ ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t_dup.S +DGEMVNKERNEL = dgemv_n_bulldozer.S + SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/dgemv_n_bulldozer.S b/kernel/x86_64/dgemv_n_bulldozer.S new file mode 100644 index 000000000..dcd7af7aa --- /dev/null +++ b/kernel/x86_64/dgemv_n_bulldozer.S @@ -0,0 +1,2405 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define A_PRE 256 + +#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS +#define VMOVUPS_YL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_YS1(OFF, ADDR, REGS) vmovups REGS, OFF(ADDR) + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + vmovsd %xmm0, ALPHA +#else + vmovsd %xmm3, ALPHA +#endif + + leaq -1(INCY), %rax + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef ALIGNED_ACCESS + leaq -1 (M), MM + testq $SIZE, A + cmoveq M, MM +#endif + + testq N, N # if n <= 0 goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + +#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) +#ifndef NOCOPY_UNALIGNED + movq Y, Y1 + andq $0xf, Y1 + orq Y1, %rax +#endif + testq %rax, %rax + cmoveq Y, BUFFER + je .L10 +#endif + + movq BUFFER, Y1 + + vxorpd %xmm4, %xmm4, %xmm4 + + movq M, %rax + addq $16, %rax + sarq $4, %rax + ALIGN_3 + +.L01: + vmovups %xmm4, 0 * SIZE(Y1) + vmovups %xmm4, 2 * SIZE(Y1) + vmovups %xmm4, 4 * SIZE(Y1) + vmovups %xmm4, 6 * SIZE(Y1) + vmovups %xmm4, 8 * SIZE(Y1) + vmovups %xmm4, 10 * SIZE(Y1) + vmovups %xmm4, 12 * SIZE(Y1) + vmovups %xmm4, 14 * SIZE(Y1) + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + +#ifdef ALIGNED_ACCESS + leaq SIZE(BUFFER), %rax + testq $SIZE, A + cmovne %rax, BUFFER + + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq 
$8, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + + vmovddup (X), %xmm8 + addq INCX, X + vmovddup (X), %xmm9 + addq INCX, X + vmovddup (X), %xmm10 + addq INCX, X + vmovddup (X), %xmm11 + addq INCX, X + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm8 , %xmm8 + vmulpd %xmm0, %xmm9 , %xmm9 + vmulpd %xmm0, %xmm10 , %xmm10 + vmulpd %xmm0, %xmm11 , %xmm11 + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A1, LDA3), %xmm7 + + + vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 + + vmovsd -16 * SIZE(A2), %xmm4 + vmovsd -16 * SIZE(A2, LDA), %xmm5 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A2, LDA3), %xmm7 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L1X: +#endif + + movq MM, I + sarq $3, I + jle .L15 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: + + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + prefetchnta A_PRE(A1,LDA,1) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + prefetchnta A_PRE(A1,LDA,2) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + prefetchnta A_PRE(A1,LDA3,1) + VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + prefetchnta A_PRE(A2) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, 
%xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + prefetchnta A_PRE(A2,LDA,1) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + prefetchnta A_PRE(A2,LDA,2) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + prefetchnta A_PRE(A2,LDA3,1) + VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + + vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + prefetchnta A_PRE(A1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + VMOVUPS_A1( -2 * SIZE, A1, %xmm7) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm13, 
%xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3 + + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 + vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 + vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L16 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm9 , %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm9 , %xmm7 , %xmm1 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm11, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm11, %xmm7 , %xmm1 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm13, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm13, %xmm7 , %xmm1 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) + VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 + vfmaddpd %xmm0 , %xmm15, %xmm6 , %xmm0 + vfmaddpd %xmm1 , %xmm15, %xmm7 , %xmm1 + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $2, MM + je .L17 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) + VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddpd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddpd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddpd %xmm0 , %xmm11, %xmm7 , %xmm0 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) + VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) + + vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddpd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddpd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddpd %xmm0 , %xmm15, %xmm7 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, MM + je .L18 + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 + 
vmovsd -16 * SIZE(A1, LDA3), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + + vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 + + vmovsd -16 * SIZE(A2), %xmm4 + vmovsd -16 * SIZE(A2, LDA), %xmm5 + vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 + vmovsd -16 * SIZE(A2, LDA3), %xmm7 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L18: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L2X: +#endif + + movq MM, I + sarq $3, I + jle .L25 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: + + + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + prefetchnta A_PRE(A2) + VMOVUPS_A1(-12 * SIZE, A2, %xmm2) + VMOVUPS_A1(-10 * SIZE, A2, %xmm3) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + prefetchnta A_PRE(A2, LDA, 1) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + prefetchnta A_PRE(A1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 + + VMOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) + prefetchnta 
A_PRE(A1, LDA, 1) + VMOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + VMOVUPS_A1(-12 * SIZE, A2, %xmm2) + VMOVUPS_A1(-10 * SIZE, A2, %xmm3) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L26 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 + + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm0) + VMOVUPS_A1(-14 * SIZE, A2, %xmm1) + + vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 + + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $2, MM + je .L27 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + VMOVUPS_A1(-16 * SIZE, A2, %xmm10) + VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12, %xmm8 , %xmm0 + vfmaddpd %xmm0 , %xmm13, %xmm9 , %xmm0 + vfmaddpd %xmm0 , %xmm14, %xmm10, %xmm0 + vfmaddpd %xmm0 , %xmm15, %xmm11, %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L28 +#else + je .L30 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A1, LDA), %xmm9 + vmovsd -16 * SIZE(A2), %xmm10 + vmovsd -16 * SIZE(A2, LDA), %xmm11 + + vfmaddsd %xmm0 , %xmm12, %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm9 , 
%xmm0 + vfmaddsd %xmm0 , %xmm14, %xmm10, %xmm0 + vfmaddsd %xmm0 , %xmm15, %xmm11, %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L28: + cmpq $4, N + jge .L21 + ALIGN_3 + +#endif + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L3X: +#endif + + movq MM, I + sarq $3, I + jle .L35 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: + + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + vmovups -8 * SIZE(A1), %xmm0 + vmovups -6 * SIZE(A1), %xmm1 + prefetchnta A_PRE(A1) + vmovups -4 * SIZE(A1), %xmm2 + vmovups -2 * SIZE(A1), %xmm3 + + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 + prefetchnta A_PRE(A2) + vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 + + vmovups -8 * SIZE(A2), %xmm4 + vmovups -6 * SIZE(A2), %xmm5 + vmovups -4 * SIZE(A2), %xmm6 + vmovups -2 * SIZE(A2) , %xmm7 + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 + vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 + vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L36 + + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + + vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 + vfmaddpd %xmm9 , %xmm13 , 
%xmm5 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $2, MM + je .L37 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + VMOVUPS_A1(-16 * SIZE, A2, %xmm9) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + vfmaddpd %xmm0 , %xmm13 , %xmm9 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L38 +#else + je .L40 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm8 + vmovsd -16 * SIZE(A2), %xmm9 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm9 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L38: + cmpq $2, N + jge .L31 + ALIGN_3 + +#endif + +.L40: + cmpq $1, N + jl .L900 +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + vmovddup (X), %xmm12 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L4X: +#endif + + movq MM, I + sarq $3, I + jle .L45 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L44 + ALIGN_3 + +.L43: + + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $4, MM + je .L46 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $2, MM + je .L47 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $1, MM + je .L900 + + vmovsd -16 * SIZE(Y1), %xmm0 + vmovsd -16 * 
SIZE(A1), %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#ifdef ALIGNED_ACCESS + jmp .L900 + ALIGN_3 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + vmovddup (X), %xmm14 + addq INCX, X + vmovddup (X), %xmm15 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + vmulpd %xmm0, %xmm14 , %xmm14 + vmulpd %xmm0, %xmm15 , %xmm15 + + testq $SIZE, A + je .L5X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A1, LDA), %xmm5 + vmovsd -16 * SIZE(A2), %xmm6 + vmovsd -16 * SIZE(A2, LDA), %xmm7 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 + vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15 , %xmm7 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L5X: + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $3, I + jle .L55 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: + + + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + prefetchnta A_PRE(A1, LDA, 1) + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + + prefetchnta A_PRE(A2) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + prefetchnta A_PRE(A2, LDA, 1) + vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + + shufpd $1, %xmm4, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + + prefetchnta A_PRE(A1) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm9, %xmm6 + vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 
+ subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + + +.L54: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1(-12 * SIZE, A2, %xmm6) + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 + VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 + shufpd $1, %xmm9, %xmm6 + vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $4, MM + je .L56 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) + VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 + + VMOVUPS_A1(-16 * SIZE, A2, %xmm4) + VMOVUPS_A1(-14 * SIZE, A2, %xmm5) + + vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 + + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) + VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + movaps %xmm7, %xmm9 + shufpd $1, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm15 , %xmm6 , %xmm1 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $2, MM + je .L57 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + VMOVUPS_A1(-16 * SIZE, A2, %xmm6) + VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + shufpd $1, %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm14 , %xmm6 , %xmm0 + shufpd $1, %xmm7, %xmm9 + vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 + movaps %xmm7, %xmm9 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $1, MM + je .L58 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 
+ vmovsd -16 * SIZE(A2), %xmm6 + shufpd $1, %xmm9, %xmm9 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 + vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 + vfmaddsd %xmm0 , %xmm15 , %xmm9 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L58: + cmpq $4, N + jge .L51 + ALIGN_3 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + + vmovddup (X), %xmm12 + addq INCX, X + vmovddup (X), %xmm13 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + vmulpd %xmm0, %xmm13 , %xmm13 + + testq $SIZE, A + je .L6X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(A2), %xmm5 + + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L6X: + movhpd -16 * SIZE(A2), %xmm8 + + movq MM, I + sarq $3, I + jle .L65 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + VMOVUPS_A1(-12 * SIZE, A1, %xmm6) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L64 + ALIGN_3 + +.L63: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A1(-15 * SIZE, A2, %xmm4) + + prefetchnta A_PRE(A2) + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A1(-13 * SIZE, A2, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A1(-11 * SIZE, A2, %xmm6) + + + shufpd $1, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + VMOVUPS_A1( -8 * SIZE, A1, %xmm4) + + prefetchnta A_PRE(A1) + shufpd $1, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + VMOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + VMOVUPS_A1( -4 * SIZE, A1, %xmm6) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L63 + ALIGN_3 + +.L64: + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + VMOVUPS_A1(-10 * SIZE, A1, %xmm7) + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + VMOVUPS_A1(-15 * SIZE, A2, %xmm4) + + vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 + VMOVUPS_A1(-13 * SIZE, A2, %xmm5) + vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 + VMOVUPS_A1(-11 * SIZE, A2, %xmm6) + + shufpd $0x01, %xmm4, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + VMOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $0x01, %xmm5, %xmm4 + vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 + + shufpd $0x01, %xmm6, %xmm5 + vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 + shufpd $0x01, %xmm8, %xmm6 + vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L65: + testq $4, MM + je .L66 + + + VMOVUPS_A1(-16 * SIZE, 
A1, %xmm4) + VMOVUPS_A1(-14 * SIZE, A1, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 + + VMOVUPS_A1(-15 * SIZE, A2, %xmm6) + VMOVUPS_A1(-13 * SIZE, A2, %xmm7) + + shufpd $0x01, %xmm6, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm7, %xmm8 + shufpd $0x01, %xmm7, %xmm6 + vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L66: + testq $2, MM + je .L67 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) + VMOVUPS_A1(-15 * SIZE, A2, %xmm5) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 + shufpd $0x01, %xmm5, %xmm8 + vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 + movaps %xmm5, %xmm8 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L67: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L68 +#else + je .L70 +#endif + + vmovsd -16 * SIZE(Y1), %xmm0 + + vmovsd -16 * SIZE(A1), %xmm4 + vshufpd $0x01, %xmm8, %xmm8 , %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L68: + cmpq $2, N + jge .L61 + ALIGN_3 + +#endif + +.L70: + cmpq $1, N + jl .L900 + +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + vmovddup (X), %xmm12 + addq INCX, X + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0, %xmm12 , %xmm12 + + testq $SIZE, A + je .L7X + + vmovsd -16 * SIZE(A1), %xmm4 + vmovsd -16 * SIZE(Y1), %xmm0 + + vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L7X: + + movq MM, I + sarq $3, I + jle .L75 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + VMOVUPS_A1(-12 * SIZE, A1, %xmm2) + VMOVUPS_A1(-10 * SIZE, A1, %xmm3) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L74 + ALIGN_3 + +.L73: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + VMOVUPS_A1( -8 * SIZE, A1, %xmm0) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_A1( -6 * SIZE, A1, %xmm1) + + prefetchnta A_PRE(A1) + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + VMOVUPS_A1( -4 * SIZE, A1, %xmm2) + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + VMOVUPS_A1( -2 * SIZE, A1, %xmm3) + + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + prefetchnta A_PRE(Y1) + VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L73 + ALIGN_3 + +.L74: + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 + VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 + VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L75: + testq $4, MM + je .L76 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm0) + VMOVUPS_A1(-14 * SIZE, A1, %xmm1) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 + VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L76: + testq $2, MM + je .L77 + + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) + + VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L77: + testq $1, MM + je .L900 + + vmovsd -16 * SIZE(Y1), %xmm0 + vmovsd -16 * SIZE(A1), %xmm8 + + vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 + + vmovsd %xmm0, -16 * SIZE(Y1) +#endif + ALIGN_3 + + +.L900: +#ifndef COPY_FORCE + cmpq Y, BUFFER + je .L999 +#endif + + cmpq $SIZE, INCY + jne .L950 + + testq $SIZE, Y + je .L910 + + vmovsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + vmovsd %xmm0, (Y) + + addq $SIZE, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L910: + testq $SIZE, BUFFER + jne .L920 + + movq M, %rax + sarq $3, %rax + jle .L914 + ALIGN_3 + +.L912: + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + vmovups 4 * SIZE(Y), %xmm2 + vmovups 6 * SIZE(Y), %xmm3 + + vmovups 0 * SIZE(BUFFER), %xmm4 + vmovups 2 * SIZE(BUFFER), %xmm5 + vmovups 4 * SIZE(BUFFER), %xmm6 + vmovups 6 * SIZE(BUFFER), %xmm7 + + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + vmovups %xmm2, 4 * SIZE(Y) + vmovups %xmm3, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L912 + ALIGN_3 + +.L914: + testq $7, M + jle .L999 + + testq $4, M + jle .L915 + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + + vmovups 0 * SIZE(BUFFER), %xmm4 + vmovups 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L915: + testq $2, M + jle .L916 + + vmovups (Y), %xmm0 + + vmovups (BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + vmovups %xmm0, (Y) + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L916: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vmovsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L920: + vmovups -1 * SIZE(BUFFER), %xmm4 + + movq M, %rax + sarq $3, %rax + jle .L924 + ALIGN_3 + +.L922: + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + vmovups 4 * SIZE(Y), %xmm2 + vmovups 6 * SIZE(Y), %xmm3 + + vmovups 1 * SIZE(BUFFER), %xmm5 + vmovups 3 * SIZE(BUFFER), %xmm6 + vmovups 5 * SIZE(BUFFER), %xmm7 + vmovups 7 * SIZE(BUFFER), %xmm8 + + shufpd $0x01, %xmm5, %xmm4 + shufpd $0x01, %xmm6, %xmm5 + shufpd $0x01, %xmm7, %xmm6 + shufpd $0x01, %xmm8, %xmm7 + + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + vmovups %xmm2, 4 * SIZE(Y) + vmovups %xmm3, 6 * SIZE(Y) + + vmovups %xmm8, %xmm4 + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L922 + ALIGN_3 + +.L924: + testq $7, M + jle .L999 + + testq $4, M + jle .L925 + + vmovups 0 * SIZE(Y), %xmm0 + vmovups 2 * SIZE(Y), %xmm1 + + vmovups 1 * SIZE(BUFFER), %xmm5 + vmovups 3 * SIZE(BUFFER), %xmm6 + + shufpd $0x01, %xmm5, %xmm4 + shufpd $0x01, %xmm6, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovups %xmm0, 0 * SIZE(Y) + vmovups %xmm1, 2 * SIZE(Y) + + vmovups %xmm6, %xmm4 + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L925: + testq $2, M + jle .L926 + + vmovups (Y), %xmm0 + + 
vmovups 1 * SIZE(BUFFER), %xmm5 + + shufpd $0x01, %xmm5, %xmm4 + + addpd %xmm4, %xmm0 + + vmovups %xmm0, (Y) + + movaps %xmm5, %xmm4 + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L926: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vshufpd $0x01, %xmm4 ,%xmm4, %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L950: + testq $SIZE, BUFFER + je .L960 + + vmovsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + vmovsd %xmm0, (Y) + + addq INCY, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L960: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L964 + ALIGN_3 + +.L962: + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + vmovsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + vmovups 2 * SIZE(BUFFER), %xmm5 + + vmovsd (Y), %xmm2 + addq INCY, Y + movhpd (Y), %xmm2 + addq INCY, Y + + vmovups 4 * SIZE(BUFFER), %xmm6 + + addpd %xmm4, %xmm0 + + vmovsd (Y), %xmm3 + addq INCY, Y + movhpd (Y), %xmm3 + addq INCY, Y + + vmovups 6 * SIZE(BUFFER), %xmm7 + + addpd %xmm5, %xmm1 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addpd %xmm6, %xmm2 + + vmovsd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addpd %xmm7, %xmm3 + + vmovsd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + vmovsd %xmm3, (Y1) + addq INCY, Y1 + movhpd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L962 + ALIGN_3 + +.L964: + testq $7, M + jle .L999 + + testq $4, M + jle .L965 + + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + vmovsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + vmovups 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + vmovsd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L965: + testq $2, M + jle .L966 + + vmovsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + vmovups 0 * SIZE(BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + vmovsd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L966: + testq $1, M + jle .L999 + + vmovsd (Y), %xmm0 + + vmovsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + vmovsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/param.h b/param.h index 0357c1323..76cc3236f 100644 --- a/param.h +++ b/param.h @@ -187,6 +187,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 #endif #if 0 From 7c8227101b2ad1d38180665ab6ffa17d60ea6ea9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 16 Jun 2013 12:50:45 +0200 Subject: [PATCH 2/6] cleanup of dgemv_n_bulldozer.S and optimization of inner loop --- kernel/x86_64/dgemv_n_bulldozer.S | 416 ++++++++++++------------------ 1 file changed, 168 insertions(+), 248 deletions(-) diff --git a/kernel/x86_64/dgemv_n_bulldozer.S b/kernel/x86_64/dgemv_n_bulldozer.S index dcd7af7aa..c954f1929 100644 --- a/kernel/x86_64/dgemv_n_bulldozer.S +++ b/kernel/x86_64/dgemv_n_bulldozer.S @@ -40,6 +40,8 @@ #include "common.h" #include "l2param.h" +#undef ALIGNED_ACCESS + #define A_PRE 256 #define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS @@ -304,11 +306,6 @@ sarq $3, I jle .L15 - VMOVUPS_A1(-16 * SIZE, A1, %xmm4) - VMOVUPS_A1(-14 * SIZE, A1, %xmm5) - VMOVUPS_A1(-12 * SIZE, A1, %xmm6) - VMOVUPS_A1(-10 * SIZE, A1, %xmm7) - VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) @@ -316,105 +313,60 @@ decq I jle .L14 - ALIGN_3 + .align 32 .L13: - vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) - prefetchnta A_PRE(A1,LDA,1) - VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) - - - vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) - prefetchnta A_PRE(A1,LDA,2) - VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) - - - vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) - prefetchnta A_PRE(A1,LDA3,1) - VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) - - - vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 - - VMOVUPS_A1(-16 * SIZE, A2, %xmm4) - VMOVUPS_A1(-14 * SIZE, A2, %xmm5) - prefetchnta A_PRE(A2) - VMOVUPS_A1(-12 * SIZE, A2, %xmm6) - VMOVUPS_A1(-10 * SIZE, A2, %xmm7) - - - vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) - prefetchnta A_PRE(A2,LDA,1) - VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) - - - vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) - prefetchnta A_PRE(A2,LDA,2) - VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) - - - vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 - vfmaddpd 
%xmm3 , %xmm14, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) - prefetchnta A_PRE(A2,LDA3,1) - VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) - - - vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 - - VMOVUPS_A1( -8 * SIZE, A1, %xmm4) - VMOVUPS_A1( -6 * SIZE, A1, %xmm5) prefetchnta A_PRE(A1) - VMOVUPS_A1( -4 * SIZE, A1, %xmm6) - VMOVUPS_A1( -2 * SIZE, A1, %xmm7) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm8, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1) , %xmm8, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1) , %xmm8, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1) , %xmm8, %xmm3 + nop + + prefetchnta A_PRE(A1,LDA,1) + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 1) , %xmm9 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 1) , %xmm9 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 1) , %xmm9 , %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 1) , %xmm9 , %xmm3 + + prefetchnta A_PRE(A1,LDA,2) + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 2) , %xmm10, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 2) , %xmm10, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 2) , %xmm10, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 2) , %xmm10, %xmm3 + + prefetchnta A_PRE(A1,LDA3,1) + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA3, 1) , %xmm11, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA3, 1) , %xmm11, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA3, 1) , %xmm11, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1, LDA3, 1) , %xmm11, %xmm3 + + prefetchnta A_PRE(A2) + vfmaddpd %xmm0 , -16 * SIZE(A2) , %xmm12, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2) , %xmm12, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm12, %xmm3 + nop + + prefetchnta A_PRE(A2,LDA,1) + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 1) , %xmm13, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 1) , %xmm13, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 1) , %xmm13, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 1) , %xmm13, %xmm3 + + prefetchnta A_PRE(A2,LDA,2) + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 2) , %xmm14, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 2) , %xmm14, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 2) , %xmm14, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 2) , %xmm14, %xmm3 + + prefetchnta A_PRE(A2,LDA3,1) + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA3, 1) , %xmm15, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA3, 1) , %xmm15, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA3, 1) , %xmm15, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA3, 1) , %xmm15, %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) @@ -439,80 +391,48 @@ ALIGN_3 .L14: - vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3 - VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) - VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm8, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1) , %xmm8, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1) , %xmm8, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1) , %xmm8, %xmm3 - vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3 + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 1) , %xmm9 , %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 1) , %xmm9 , %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 1) , %xmm9 , %xmm2 + 
vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 1) , %xmm9 , %xmm3 - VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) - VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 2) , %xmm10, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 2) , %xmm10, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 2) , %xmm10, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 2) , %xmm10, %xmm3 - vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3 + vfmaddpd %xmm0 , -16 * SIZE(A1, LDA3, 1) , %xmm11, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A1, LDA3, 1) , %xmm11, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A1, LDA3, 1) , %xmm11, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A1, LDA3, 1) , %xmm11, %xmm3 - VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) - VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + vfmaddpd %xmm0 , -16 * SIZE(A2) , %xmm12, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2) , %xmm12, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm12, %xmm3 - vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3 + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 1) , %xmm13, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 1) , %xmm13, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 1) , %xmm13, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 1) , %xmm13, %xmm3 - VMOVUPS_A1(-16 * SIZE, A2, %xmm4) - VMOVUPS_A1(-14 * SIZE, A2, %xmm5) - VMOVUPS_A1(-12 * SIZE, A2, %xmm6) - VMOVUPS_A1(-10 * SIZE, A2, %xmm7) + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 2) , %xmm14, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 2) , %xmm14, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 2) , %xmm14, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 2) , %xmm14, %xmm3 - vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3 + vfmaddpd %xmm0 , -16 * SIZE(A2, LDA3, 1) , %xmm15, %xmm0 + vfmaddpd %xmm1 , -14 * SIZE(A2, LDA3, 1) , %xmm15, %xmm1 + vfmaddpd %xmm2 , -12 * SIZE(A2, LDA3, 1) , %xmm15, %xmm2 + vfmaddpd %xmm3 , -10 * SIZE(A2, LDA3, 1) , %xmm15, %xmm3 - VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) - VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) - vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) - VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) - - vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3 - - VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) - VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) - VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) - VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) - - vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0 - vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1 - vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2 - vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) @@ -1364,8 +1284,8 @@ ALIGN_3 .L5X: 
- movhpd -16 * SIZE(A1, LDA), %xmm8 - movhpd -16 * SIZE(A2, LDA), %xmm9 + vmovhpd -16 * SIZE(A1, LDA), %xmm8, %xmm8 + vmovhpd -16 * SIZE(A2, LDA), %xmm9, %xmm9 movq MM, I sarq $3, I @@ -1400,18 +1320,18 @@ VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) - shufpd $1, %xmm4, %xmm8 + vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) - shufpd $1, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) prefetchnta A_PRE(A2) - shufpd $1, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1(-14 * SIZE, A2, %xmm5) - shufpd $1, %xmm8, %xmm6 + vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1(-12 * SIZE, A2, %xmm6) @@ -1428,18 +1348,18 @@ VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) - shufpd $1, %xmm4, %xmm9 + vshufpd $0x01, %xmm4, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) - shufpd $1, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 VMOVUPS_A1( -8 * SIZE, A1, %xmm4) prefetchnta A_PRE(A1) - shufpd $1, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 VMOVUPS_A1( -6 * SIZE, A1, %xmm5) - shufpd $1, %xmm9, %xmm6 + vshufpd $0x01, %xmm9, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 VMOVUPS_A1( -4 * SIZE, A1, %xmm6) @@ -1477,17 +1397,17 @@ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) - shufpd $1, %xmm4, %xmm8 + vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) - shufpd $1, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) - shufpd $1, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1(-14 * SIZE, A2, %xmm5) - shufpd $1, %xmm8, %xmm6 + vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1(-12 * SIZE, A2, %xmm6) @@ -1501,15 +1421,15 @@ vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) - shufpd $1, %xmm4, %xmm9 + vshufpd $0x01, %xmm4, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) - shufpd $1, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 - shufpd $1, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 - shufpd $1, %xmm9, %xmm6 + vshufpd $0x01, %xmm9, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) @@ -1538,10 +1458,10 @@ VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) - shufpd $1, %xmm6, %xmm8 + vshufpd $0x01, %xmm6, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm7, %xmm8 - shufpd $1, %xmm7, %xmm6 + vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) @@ -1553,10 +1473,10 @@ VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) - shufpd $1, %xmm6, %xmm9 + vshufpd $0x01, %xmm6, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 movaps %xmm7, %xmm9 - shufpd $1, %xmm7, %xmm6 + vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm15 , %xmm6 , %xmm1 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) @@ -1579,11 +1499,11 @@ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm4 
, %xmm0 - shufpd $1, %xmm5, %xmm8 + vshufpd $0x01, %xmm5, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm5, %xmm8 vfmaddpd %xmm0 , %xmm14 , %xmm6 , %xmm0 - shufpd $1, %xmm7, %xmm9 + vshufpd $0x01, %xmm7, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 movaps %xmm7, %xmm9 @@ -1601,9 +1521,9 @@ vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm4 - shufpd $1, %xmm8, %xmm8 + vshufpd $0x01, %xmm8, %xmm8, %xmm8 vmovsd -16 * SIZE(A2), %xmm6 - shufpd $1, %xmm9, %xmm9 + vshufpd $0x01, %xmm9, %xmm9, %xmm9 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 @@ -1668,7 +1588,7 @@ ALIGN_3 .L6X: - movhpd -16 * SIZE(A2), %xmm8 + vmovhpd -16 * SIZE(A2), %xmm8, %xmm8 movq MM, I sarq $3, I @@ -1701,18 +1621,18 @@ VMOVUPS_A1(-11 * SIZE, A2, %xmm6) - shufpd $1, %xmm4, %xmm8 + vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A1( -9 * SIZE, A2, %xmm8) - shufpd $1, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1( -8 * SIZE, A1, %xmm4) prefetchnta A_PRE(A1) - shufpd $1, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1( -6 * SIZE, A1, %xmm5) - shufpd $1, %xmm8, %xmm6 + vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1( -4 * SIZE, A1, %xmm6) @@ -1749,15 +1669,15 @@ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A1(-11 * SIZE, A2, %xmm6) - shufpd $0x01, %xmm4, %xmm8 + vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A1( -9 * SIZE, A2, %xmm8) - shufpd $0x01, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 - shufpd $0x01, %xmm6, %xmm5 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 - shufpd $0x01, %xmm8, %xmm6 + vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) @@ -1787,10 +1707,10 @@ VMOVUPS_A1(-15 * SIZE, A2, %xmm6) VMOVUPS_A1(-13 * SIZE, A2, %xmm7) - shufpd $0x01, %xmm6, %xmm8 + vshufpd $0x01, %xmm6, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm7, %xmm8 - shufpd $0x01, %xmm7, %xmm6 + vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) @@ -1811,7 +1731,7 @@ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 - shufpd $0x01, %xmm5, %xmm8 + vshufpd $0x01, %xmm5, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm5, %xmm8 @@ -2009,7 +1929,7 @@ je .L910 vmovsd (Y), %xmm0 - addsd (BUFFER), %xmm0 + vaddsd (BUFFER), %xmm0, %xmm0 vmovsd %xmm0, (Y) addq $SIZE, Y @@ -2041,10 +1961,10 @@ vmovups 6 * SIZE(BUFFER), %xmm7 - addpd %xmm4, %xmm0 - addpd %xmm5, %xmm1 - addpd %xmm6, %xmm2 - addpd %xmm7, %xmm3 + vaddpd %xmm4, %xmm0, %xmm0 + vaddpd %xmm5, %xmm1, %xmm1 + vaddpd %xmm6, %xmm2, %xmm2 + vaddpd %xmm7, %xmm3, %xmm3 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) @@ -2071,8 +1991,8 @@ vmovups 0 * SIZE(BUFFER), %xmm4 vmovups 2 * SIZE(BUFFER), %xmm5 - addpd %xmm4, %xmm0 - addpd %xmm5, %xmm1 + vaddpd %xmm4, %xmm0, %xmm0 + vaddpd %xmm5, %xmm1, %xmm1 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) @@ -2089,7 +2009,7 @@ vmovups (BUFFER), %xmm4 - addpd %xmm4, %xmm0 + vaddpd %xmm4, %xmm0, %xmm0 vmovups %xmm0, (Y) @@ -2105,7 +2025,7 @@ vmovsd 0 * SIZE(BUFFER), %xmm4 - addsd %xmm4, %xmm0 + vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y) ALIGN_3 @@ -2133,16 +2053,16 @@ vmovups 5 * SIZE(BUFFER), %xmm7 vmovups 7 * SIZE(BUFFER), 
%xmm8 - shufpd $0x01, %xmm5, %xmm4 - shufpd $0x01, %xmm6, %xmm5 - shufpd $0x01, %xmm7, %xmm6 - shufpd $0x01, %xmm8, %xmm7 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 + vshufpd $0x01, %xmm7, %xmm6, %xmm6 + vshufpd $0x01, %xmm8, %xmm7, %xmm7 - addpd %xmm4, %xmm0 - addpd %xmm5, %xmm1 - addpd %xmm6, %xmm2 - addpd %xmm7, %xmm3 + vaddpd %xmm4, %xmm0, %xmm0 + vaddpd %xmm5, %xmm1, %xmm1 + vaddpd %xmm6, %xmm2, %xmm2 + vaddpd %xmm7, %xmm3, %xmm3 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) @@ -2171,11 +2091,11 @@ vmovups 1 * SIZE(BUFFER), %xmm5 vmovups 3 * SIZE(BUFFER), %xmm6 - shufpd $0x01, %xmm5, %xmm4 - shufpd $0x01, %xmm6, %xmm5 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 + vshufpd $0x01, %xmm6, %xmm5, %xmm5 - addpd %xmm4, %xmm0 - addpd %xmm5, %xmm1 + vaddpd %xmm4, %xmm0, %xmm0 + vaddpd %xmm5, %xmm1, %xmm1 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) @@ -2194,9 +2114,9 @@ vmovups 1 * SIZE(BUFFER), %xmm5 - shufpd $0x01, %xmm5, %xmm4 + vshufpd $0x01, %xmm5, %xmm4, %xmm4 - addpd %xmm4, %xmm0 + vaddpd %xmm4, %xmm0, %xmm0 vmovups %xmm0, (Y) @@ -2214,7 +2134,7 @@ vshufpd $0x01, %xmm4 ,%xmm4, %xmm4 - addsd %xmm4, %xmm0 + vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y) ALIGN_3 @@ -2227,7 +2147,7 @@ je .L960 vmovsd (Y), %xmm0 - addsd (BUFFER), %xmm0 + vaddsd (BUFFER), %xmm0, %xmm0 vmovsd %xmm0, (Y) addq INCY, Y @@ -2248,57 +2168,57 @@ .L962: vmovsd (Y), %xmm0 addq INCY, Y - movhpd (Y), %xmm0 + vmovhpd (Y), %xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 vmovsd (Y), %xmm1 addq INCY, Y - movhpd (Y), %xmm1 + vmovhpd (Y), %xmm1, %xmm1 addq INCY, Y vmovups 2 * SIZE(BUFFER), %xmm5 vmovsd (Y), %xmm2 addq INCY, Y - movhpd (Y), %xmm2 + vmovhpd (Y), %xmm2, %xmm2 addq INCY, Y vmovups 4 * SIZE(BUFFER), %xmm6 - addpd %xmm4, %xmm0 + vaddpd %xmm4, %xmm0, %xmm0 vmovsd (Y), %xmm3 addq INCY, Y - movhpd (Y), %xmm3 + vmovhpd (Y), %xmm3, %xmm3 addq INCY, Y vmovups 6 * SIZE(BUFFER), %xmm7 - addpd %xmm5, %xmm1 + vaddpd %xmm5, %xmm1, %xmm1 vmovsd %xmm0, (Y1) addq INCY, Y1 - movhpd %xmm0, (Y1) + vmovhpd %xmm0, (Y1) addq INCY, Y1 - addpd %xmm6, %xmm2 + vaddpd %xmm6, %xmm2, %xmm2 vmovsd %xmm1, (Y1) addq INCY, Y1 - movhpd %xmm1, (Y1) + vmovhpd %xmm1, (Y1) addq INCY, Y1 - addpd %xmm7, %xmm3 + vaddpd %xmm7, %xmm3, %xmm3 vmovsd %xmm2, (Y1) addq INCY, Y1 - movhpd %xmm2, (Y1) + vmovhpd %xmm2, (Y1) addq INCY, Y1 vmovsd %xmm3, (Y1) addq INCY, Y1 - movhpd %xmm3, (Y1) + vmovhpd %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER @@ -2315,28 +2235,28 @@ vmovsd (Y), %xmm0 addq INCY, Y - movhpd (Y), %xmm0 + vmovhpd (Y), %xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 vmovsd (Y), %xmm1 addq INCY, Y - movhpd (Y), %xmm1 + vmovhpd (Y), %xmm1, %xmm1 addq INCY, Y vmovups 2 * SIZE(BUFFER), %xmm5 - addpd %xmm4, %xmm0 - addpd %xmm5, %xmm1 + vaddpd %xmm4, %xmm0, %xmm0 + vaddpd %xmm5, %xmm1, %xmm1 vmovsd %xmm0, (Y1) addq INCY, Y1 - movhpd %xmm0, (Y1) + vmovhpd %xmm0, (Y1) addq INCY, Y1 vmovsd %xmm1, (Y1) addq INCY, Y1 - movhpd %xmm1, (Y1) + vmovhpd %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER @@ -2348,16 +2268,16 @@ vmovsd (Y), %xmm0 addq INCY, Y - movhpd (Y), %xmm0 + vmovhpd (Y),%xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 - addpd %xmm4, %xmm0 + vaddpd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y1) addq INCY, Y1 - movhpd %xmm0, (Y1) + vmovhpd %xmm0, (Y1) addq INCY, Y1 addq $2 * SIZE, BUFFER @@ -2371,7 +2291,7 @@ vmovsd 0 * SIZE(BUFFER), %xmm4 - addsd %xmm4, %xmm0 + vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y1) ALIGN_3 From 9e58dd509e706a3921416e049944fdb5abc43efa Mon Sep 17 00:00:00 2001 
From: wernsaar Date: Mon, 17 Jun 2013 12:55:12 +0200 Subject: [PATCH 3/6] added gemm_ncopy_2_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 4 +- kernel/x86_64/gemm_ncopy_2_bulldozer.S | 360 +++++++++++++++++++++++++ 2 files changed, 362 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/gemm_ncopy_2_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 8261bf42f..1a0395a0f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -6,7 +6,7 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -15,7 +15,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/gemm_ncopy_2_bulldozer.S b/kernel/x86_64/gemm_ncopy_2_bulldozer.S new file mode 100644 index 000000000..02d72f009 --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_2_bulldozer.S @@ -0,0 +1,360 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + vmovups %xmm13, 112(%rsp) + vmovups %xmm14, 128(%rsp) + vmovups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L01: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $3, I + jle .L08 + ALIGN_4 + +.L03: + +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss 0 * SIZE(AO2), %xmm1 + vmovss 1 * SIZE(AO1), %xmm2 + vmovss 1 * SIZE(AO2), %xmm3 + vmovss 2 * SIZE(AO1), %xmm4 + vmovss 2 * SIZE(AO2), %xmm5 + vmovss 3 * SIZE(AO1), %xmm6 + vmovss 3 * SIZE(AO2), %xmm7 + + vmovss 4 * SIZE(AO1), %xmm8 + vmovss 4 * SIZE(AO2), %xmm9 + vmovss 5 * SIZE(AO1), %xmm10 + vmovss 5 * SIZE(AO2), %xmm11 + vmovss 6 * SIZE(AO1), %xmm12 + vmovss 6 * SIZE(AO2), %xmm13 + vmovss 7 * SIZE(AO1), %xmm14 + vmovss 7 * SIZE(AO2), %xmm15 + + vmovss %xmm0, 0 * SIZE(B) + vmovss %xmm1, 1 * SIZE(B) + vmovss %xmm2, 2 * SIZE(B) + vmovss %xmm3, 3 * SIZE(B) + vmovss %xmm4, 4 * SIZE(B) + vmovss %xmm5, 5 * SIZE(B) + vmovss %xmm6, 6 * SIZE(B) + vmovss %xmm7, 7 * SIZE(B) + + vmovss %xmm8, 8 * SIZE(B) + vmovss %xmm9, 9 * SIZE(B) + vmovss %xmm10, 10 * SIZE(B) + vmovss %xmm11, 11 * SIZE(B) + vmovss %xmm12, 12 * SIZE(B) + vmovss %xmm13, 13 * SIZE(B) + vmovss %xmm14, 14 * SIZE(B) + vmovss %xmm15, 15 * SIZE(B) + +#else + prefetchw 256(B) + + prefetchnta 256(AO1) + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 1 * SIZE(AO1), %xmm1 + vmovsd 2 * SIZE(AO1), %xmm2 + vmovsd 3 * SIZE(AO1), %xmm3 + vmovsd 4 * SIZE(AO1), %xmm4 + vmovsd 5 * SIZE(AO1), %xmm5 + vmovsd 6 * SIZE(AO1), %xmm6 + vmovsd 7 * SIZE(AO1), %xmm7 + + prefetchnta 256(AO2) + vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 + vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 + vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 + vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 + vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4 + vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5 + vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6 + vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7 + + + prefetchw 256+64(B) + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm1, 2 * SIZE(B) + vmovups %xmm2, 4 * SIZE(B) + vmovups %xmm3, 6 * SIZE(B) + vmovups %xmm4, 8 * SIZE(B) + vmovups %xmm5, 10 * SIZE(B) + vmovups %xmm6, 12 * SIZE(B) + vmovups %xmm7, 14 * SIZE(B) + +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + decq I + jg .L03 + ALIGN_4 + + +.L08: + testq $4 , M + je .L14 + + ALIGN_4 + + +.L13: +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss 0 * SIZE(AO2), %xmm1 + vmovss 1 * SIZE(AO1), %xmm2 + vmovss 1 * SIZE(AO2), %xmm3 + vmovss 2 * SIZE(AO1), %xmm4 + vmovss 2 * SIZE(AO2), %xmm5 + vmovss 3 * SIZE(AO1), %xmm6 + vmovss 3 * SIZE(AO2), %xmm7 + + 
vmovss %xmm0, 0 * SIZE(B) + vmovss %xmm1, 1 * SIZE(B) + vmovss %xmm2, 2 * SIZE(B) + vmovss %xmm3, 3 * SIZE(B) + vmovss %xmm4, 4 * SIZE(B) + vmovss %xmm5, 5 * SIZE(B) + vmovss %xmm6, 6 * SIZE(B) + vmovss %xmm7, 7 * SIZE(B) +#else + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 1 * SIZE(AO1), %xmm1 + vmovsd 2 * SIZE(AO1), %xmm2 + vmovsd 3 * SIZE(AO1), %xmm3 + + vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 + vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 + vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 + vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 + + + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm1, 2 * SIZE(B) + vmovups %xmm2, 4 * SIZE(B) + vmovups %xmm3, 6 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss 0 * SIZE(AO2), %xmm1 + + vmovss %xmm0, 0 * SIZE(B) + vmovss %xmm1, 1 * SIZE(B) +#else + vmovsd 0 * SIZE(AO1), %xmm0 + vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 + + vmovups %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L01 + ALIGN_4 + +.L20: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: +#ifndef DOUBLE + vmovups 0 * SIZE(AO1), %xmm0 + + vmovups %xmm0, 0 * SIZE(B) +#else + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm1 + + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm1, 2 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss %xmm0, 0 * SIZE(B) +#else + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + vmovups 0(%rsp), %xmm6 + vmovups 16(%rsp), %xmm7 + vmovups 32(%rsp), %xmm8 + vmovups 48(%rsp), %xmm9 + vmovups 64(%rsp), %xmm10 + vmovups 80(%rsp), %xmm11 + vmovups 96(%rsp), %xmm12 + vmovups 112(%rsp), %xmm13 + vmovups 128(%rsp), %xmm14 + vmovups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE From d0b6299b136b43ea87a61b3e6836b0b82275229f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 17 Jun 2013 14:19:09 +0200 Subject: [PATCH 4/6] added dgemm_tcopy_8_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/dgemm_tcopy_8_bulldozer.S | 667 ++++++++++++++++++++++++ 2 files changed, 668 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_tcopy_8_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 1a0395a0f..1bd8073c3 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -14,7 +14,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S new file mode 100644 index 000000000..e62b9da4a --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S @@ -0,0 +1,667 @@ +/*********************************************************************/ 
+/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS +#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS + +#define A_PRE 256 + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 56(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax + +#define B0 %rbp +#define B1 %r13 +#define B2 %r14 +#define B3 %r15 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B1 + movq M, B2 + movq M, B3 + + andq $-8, B1 + andq $-4, B2 + andq $-2, B3 + + imulq N, B1 + imulq N, B2 + imulq N, B3 + + leaq (B, B1, SIZE), B1 + leaq (B, B2, SIZE), B2 + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $8, N + jl .L20 + ALIGN_4 + +.L11: + subq $8, N + + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + movq B, B0 + addq $64 * SIZE, B + + movq M, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: + + prefetchnta A_PRE(AO1) + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A1(4 * SIZE, AO1, %xmm2) + VMOVUPS_A1(6 * SIZE, AO1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B0) + vmovups %xmm1, -14 * SIZE(B0) + vmovups %xmm2, -12 * SIZE(B0) + vmovups %xmm3, -10 * SIZE(B0) + + + prefetchnta A_PRE(AO1, LDA, 1) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + + vmovups %xmm0, -8 * SIZE(B0) + vmovups %xmm1, -6 * SIZE(B0) + vmovups %xmm2, -4 * SIZE(B0) + vmovups %xmm3, -2 * SIZE(B0) + + + prefetchnta A_PRE(AO1, LDA, 2) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) + VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) + + + vmovups %xmm0, 0 * SIZE(B0) + vmovups %xmm1, 2 * SIZE(B0) + vmovups %xmm2, 4 * SIZE(B0) + vmovups %xmm3, 6 * SIZE(B0) + + + prefetchnta A_PRE(AO1, LDA3, 1) + VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) + + vmovups %xmm0, 8 * SIZE(B0) + vmovups %xmm1, 10 * SIZE(B0) + vmovups %xmm2, 12 * SIZE(B0) + vmovups %xmm3, 14 * SIZE(B0) + + prefetchnta A_PRE(AO2) + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A1(2 * SIZE, AO2, %xmm1) + VMOVUPS_A1(4 * SIZE, AO2, %xmm2) + VMOVUPS_A1(6 * SIZE, AO2, %xmm3) + + vmovups %xmm0, 16 * SIZE(B0) + vmovups %xmm1, 18 * SIZE(B0) + vmovups %xmm2, 20 * SIZE(B0) + vmovups %xmm3, 22 * SIZE(B0) + + prefetchnta A_PRE(AO2, LDA, 1) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + + vmovups %xmm0, 24 * SIZE(B0) + vmovups %xmm1, 26 * SIZE(B0) + vmovups %xmm2, 28 * SIZE(B0) + vmovups %xmm3, 30 * SIZE(B0) + + prefetchnta A_PRE(AO2, LDA, 2) + VMOVUPS_A2(0 * SIZE, 
AO2, LDA, 2, %xmm0) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) + VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) + + vmovups %xmm0, 32 * SIZE(B0) + vmovups %xmm1, 34 * SIZE(B0) + vmovups %xmm2, 36 * SIZE(B0) + vmovups %xmm3, 38 * SIZE(B0) + + prefetchnta A_PRE(AO2, LDA3, 1) + VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) + + vmovups %xmm0, 40 * SIZE(B0) + vmovups %xmm1, 42 * SIZE(B0) + vmovups %xmm2, 44 * SIZE(B0) + vmovups %xmm3, 46 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, M + jle .L16 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B1) + vmovups %xmm1, -14 * SIZE(B1) + vmovups %xmm2, -12 * SIZE(B1) + vmovups %xmm3, -10 * SIZE(B1) + + VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) + + vmovups %xmm0, -8 * SIZE(B1) + vmovups %xmm1, -6 * SIZE(B1) + vmovups %xmm2, -4 * SIZE(B1) + vmovups %xmm3, -2 * SIZE(B1) + + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A1(2 * SIZE, AO2, %xmm1) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + vmovups %xmm0, 0 * SIZE(B1) + vmovups %xmm1, 2 * SIZE(B1) + vmovups %xmm2, 4 * SIZE(B1) + vmovups %xmm3, 6 * SIZE(B1) + + VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) + + vmovups %xmm0, 8 * SIZE(B1) + vmovups %xmm1, 10 * SIZE(B1) + vmovups %xmm2, 12 * SIZE(B1) + vmovups %xmm3, 14 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B1 + ALIGN_4 + +.L16: + testq $2, M + jle .L18 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) + VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B2) + vmovups %xmm1, -14 * SIZE(B2) + vmovups %xmm2, -12 * SIZE(B2) + vmovups %xmm3, -10 * SIZE(B2) + + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) + VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) + + vmovups %xmm0, -8 * SIZE(B2) + vmovups %xmm1, -6 * SIZE(B2) + vmovups %xmm2, -4 * SIZE(B2) + vmovups %xmm3, -2 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B2 + ALIGN_4 + +.L18: + testq $1, M + jle .L19 + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO1, LDA3), %xmm3 + + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + + vmovups %xmm0, -16 * SIZE(B3) + vmovups %xmm2, -14 * SIZE(B3) + + vmovsd 0 * SIZE(AO2), %xmm0 + vmovsd 0 * SIZE(AO2, LDA), %xmm1 + vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA3), %xmm3 + + vunpcklpd %xmm1, %xmm0 , %xmm0 + vunpcklpd %xmm3, %xmm2 , %xmm2 + + vmovups %xmm0, -12 * SIZE(B3) + vmovups %xmm2, -10 * SIZE(B3) + + subq $-8 * SIZE, B3 + ALIGN_4 + +.L19: + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: + cmpq $4, N + jl .L30 + + subq $4, N + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + movq B, B0 + addq $32 * SIZE, B + + movq M, I + sarq $3, I + jle .L24 + ALIGN_4 
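+/* .L23: copy loop for a four-wide remainder block: each pass moves eight contiguous doubles from AO1, AO1 + LDA, AO2 and AO2 + LDA into one 32-double tile of the packed buffer at B0, then advances B0 by 8 * N doubles to the next tile */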
+ +.L23: + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A1(4 * SIZE, AO1, %xmm2) + VMOVUPS_A1(6 * SIZE, AO1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B0) + vmovups %xmm1, -14 * SIZE(B0) + vmovups %xmm2, -12 * SIZE(B0) + vmovups %xmm3, -10 * SIZE(B0) + + + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + + vmovups %xmm0, -8 * SIZE(B0) + vmovups %xmm1, -6 * SIZE(B0) + vmovups %xmm2, -4 * SIZE(B0) + vmovups %xmm3, -2 * SIZE(B0) + + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A1(2 * SIZE, AO2, %xmm1) + VMOVUPS_A1(4 * SIZE, AO2, %xmm2) + VMOVUPS_A1(6 * SIZE, AO2, %xmm3) + + vmovups %xmm0, 0 * SIZE(B0) + vmovups %xmm1, 2 * SIZE(B0) + vmovups %xmm2, 4 * SIZE(B0) + vmovups %xmm3, 6 * SIZE(B0) + + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + + vmovups %xmm0, 8 * SIZE(B0) + vmovups %xmm1, 10 * SIZE(B0) + vmovups %xmm2, 12 * SIZE(B0) + vmovups %xmm3, 14 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, M + jle .L26 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B1) + vmovups %xmm1, -14 * SIZE(B1) + vmovups %xmm2, -12 * SIZE(B1) + vmovups %xmm3, -10 * SIZE(B1) + + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A1(2 * SIZE, AO2, %xmm1) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + vmovups %xmm0, -8 * SIZE(B1) + vmovups %xmm1, -6 * SIZE(B1) + vmovups %xmm2, -4 * SIZE(B1) + vmovups %xmm3, -2 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B1 + ALIGN_4 + +.L26: + testq $2, M + jle .L28 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + VMOVUPS_A1(0 * SIZE, AO2, %xmm2) + VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B2) + vmovups %xmm1, -14 * SIZE(B2) + vmovups %xmm2, -12 * SIZE(B2) + vmovups %xmm3, -10 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B2 + ALIGN_4 + +.L28: + testq $1, M + jle .L30 + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO1, LDA), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm2 + vmovsd 0 * SIZE(AO2, LDA), %xmm3 + + vunpcklpd %xmm1, %xmm0, %xmm0 + vunpcklpd %xmm3, %xmm2, %xmm2 + + vmovups %xmm0, -16 * SIZE(B3) + vmovups %xmm2, -14 * SIZE(B3) + subq $-4 * SIZE, B3 + ALIGN_4 + +.L30: + cmpq $2, N + jl .L40 + + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $16 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A1(4 * SIZE, AO1, %xmm2) + VMOVUPS_A1(6 * SIZE, AO1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B0) + vmovups %xmm1, -14 * SIZE(B0) + vmovups %xmm2, -12 * SIZE(B0) + vmovups %xmm3, -10 * SIZE(B0) + + VMOVUPS_A1(0 * SIZE, AO2, %xmm0) + VMOVUPS_A1(2 * SIZE, AO2, %xmm1) + VMOVUPS_A1(4 * SIZE, AO2, %xmm2) + VMOVUPS_A1(6 * SIZE, AO2, %xmm3) + + vmovups %xmm0, -8 * SIZE(B0) + vmovups %xmm1, -6 * SIZE(B0) + vmovups %xmm2, -4 * SIZE(B0) + vmovups %xmm3, -2 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + VMOVUPS_A1(0 * 
SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A1(0 * SIZE, AO2, %xmm2) + VMOVUPS_A1(2 * SIZE, AO2, %xmm3) + + vmovups %xmm0, -16 * SIZE(B1) + vmovups %xmm1, -14 * SIZE(B1) + vmovups %xmm2, -12 * SIZE(B1) + vmovups %xmm3, -10 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B1 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(0 * SIZE, AO2, %xmm1) + + vmovups %xmm0, -16 * SIZE(B2) + vmovups %xmm1, -14 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B2 + ALIGN_4 + +.L38: + testq $1, M + jle .L40 + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 + + vunpcklpd %xmm1, %xmm0, %xmm0 + + vmovups %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + VMOVUPS_A1(4 * SIZE, AO1, %xmm2) + VMOVUPS_A1(6 * SIZE, AO1, %xmm3) + + vmovups %xmm0, -16 * SIZE(B0) + vmovups %xmm1, -14 * SIZE(B0) + vmovups %xmm2, -12 * SIZE(B0) + vmovups %xmm3, -10 * SIZE(B0) + + addq $8 * SIZE, AO1 + leaq (B0, M8, 8), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + VMOVUPS_A1(2 * SIZE, AO1, %xmm1) + + vmovups %xmm0, -16 * SIZE(B1) + vmovups %xmm1, -14 * SIZE(B1) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B1 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + VMOVUPS_A1(0 * SIZE, AO1, %xmm0) + + vmovups %xmm0, -16 * SIZE(B2) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B2 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + vmovsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + jmp .L999 + ALIGN_4 + +.L999: + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE From a135f5d9ed3ce118bd0f9ddee8f920864756d7df Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 18 Jun 2013 11:01:33 +0200 Subject: [PATCH 5/6] added gemm_tcopy_2_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 4 +- kernel/x86_64/dgemm_tcopy_8_bulldozer.S | 2 +- kernel/x86_64/gemm_tcopy_2_bulldozer.S | 374 ++++++++++++++++++++++++ 3 files changed, 377 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/gemm_tcopy_2_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 1bd8073c3..7732e77fc 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -7,7 +7,7 @@ SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -16,7 +16,7 @@ DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S index e62b9da4a..d7fc416d9 100644 --- a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S +++ 
b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S @@ -647,7 +647,7 @@ vmovsd 0 * SIZE(AO1), %xmm0 - movlpd %xmm0, -16 * SIZE(B3) + vmovsd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 diff --git a/kernel/x86_64/gemm_tcopy_2_bulldozer.S b/kernel/x86_64/gemm_tcopy_2_bulldozer.S new file mode 100644 index 000000000..b8d61b0ae --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_2_bulldozer.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define M8 %rbp +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + vmovups %xmm13, 112(%rsp) + vmovups %xmm14, 128(%rsp) + vmovups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + andq $-2, %rax + imulq M, %rax + + leaq (B, %rax, SIZE), BO1 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + + movq M, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L01: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $4 * SIZE, B + + movq N, I + sarq $3, I + jle .L10 + ALIGN_4 + + +.L08: +#ifndef DOUBLE + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 2 * SIZE(AO1), %xmm2 + vmovsd 4 * SIZE(AO1), %xmm4 + vmovsd 6 * SIZE(AO1), %xmm6 + vmovsd 0 * SIZE(AO2), %xmm1 + vmovsd 2 * SIZE(AO2), %xmm3 + vmovsd 4 * SIZE(AO2), %xmm5 + vmovsd 6 * SIZE(AO2), %xmm7 + + vmovsd %xmm0, 0 * SIZE(BO) + vmovsd %xmm1, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovsd %xmm2, 0 * SIZE(BO) + vmovsd %xmm3, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovsd %xmm4, 0 * SIZE(BO) + vmovsd %xmm5, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovsd %xmm6, 0 * SIZE(BO) + vmovsd %xmm7, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + +#else + + prefetchnta 256(AO1) + prefetchnta 256(AO2) + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 4 * SIZE(AO1), %xmm4 + vmovups 6 * SIZE(AO1), %xmm6 + vmovups 0 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO2), %xmm3 + vmovups 4 * SIZE(AO2), %xmm5 + vmovups 6 * SIZE(AO2), %xmm7 + + vmovups %xmm0, 0 * SIZE(BO) + vmovups %xmm1, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovups %xmm2, 0 * SIZE(BO) + vmovups %xmm3, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovups %xmm4, 0 * SIZE(BO) + vmovups %xmm5, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovups %xmm6, 0 * SIZE(BO) + vmovups %xmm7, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + decq I + jg .L08 + ALIGN_4 + + + +.L10: + testq $4, N + jle .L12 +#ifndef DOUBLE + + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 2 * SIZE(AO1), %xmm2 + vmovsd 0 * SIZE(AO2), %xmm1 + vmovsd 2 * SIZE(AO2), %xmm3 + + vmovsd %xmm0, 0 * SIZE(BO) + vmovsd %xmm1, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovsd %xmm2, 0 * SIZE(BO) + vmovsd %xmm3, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + +#else + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 0 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO2), %xmm3 + + vmovups %xmm0, 0 * SIZE(BO) + vmovups %xmm1, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + + vmovups %xmm2, 0 * 
SIZE(BO) + vmovups %xmm3, 2 * SIZE(BO) + leaq (BO, M8, 2), BO + +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + ALIGN_4 + + +.L12: + testq $2, N + jle .L14 +#ifndef DOUBLE + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd 0 * SIZE(AO2), %xmm1 + + vmovsd %xmm0, 0 * SIZE(BO) + vmovsd %xmm1, 2 * SIZE(BO) +#else + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO2), %xmm1 + + vmovups %xmm0, 0 * SIZE(BO) + vmovups %xmm1, 2 * SIZE(BO) +#endif + + leaq (BO, M8, 2), BO + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss 0 * SIZE(AO2), %xmm1 + + vmovss %xmm0, 0 * SIZE(BO1) + vmovss %xmm1, 1 * SIZE(BO1) +#else + vmovsd 0 * SIZE(AO1), %xmm0 + vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 + + vmovups %xmm0, 0 * SIZE(BO1) +#endif + + addq $2 * SIZE, BO1 + ALIGN_4 + +.L19: + decq J + jg .L01 + ALIGN_4 + +.L20: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $1, I + jle .L33 + ALIGN_4 + +.L32: +#ifndef DOUBLE + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd %xmm0, 0 * SIZE(BO) +#else + vmovups 0 * SIZE(AO1), %xmm0 + vmovups %xmm0, 0 * SIZE(BO) +#endif + + addq $2 * SIZE, AO1 + leaq (BO, M8, 2), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $1, N + jle .L999 + +#ifndef DOUBLE + vmovss 0 * SIZE(AO1), %xmm0 + vmovss %xmm0, 0 * SIZE(BO1) +#else + vmovsd 0 * SIZE(AO1), %xmm0 + vmovsd %xmm0, 0 * SIZE(BO1) +#endif + addq $1 * SIZE, BO1 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + vmovups 0(%rsp), %xmm6 + vmovups 16(%rsp), %xmm7 + vmovups 32(%rsp), %xmm8 + vmovups 48(%rsp), %xmm9 + vmovups 64(%rsp), %xmm10 + vmovups 80(%rsp), %xmm11 + vmovups 96(%rsp), %xmm12 + vmovups 112(%rsp), %xmm13 + vmovups 128(%rsp), %xmm14 + vmovups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + + EPILOGUE From 93dbbe1fb884236ca28d7a7576ea0d6967ed3b12 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 18 Jun 2013 13:29:23 +0200 Subject: [PATCH 6/6] added dgemm_ncopy_8_bulldozer.S --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/dgemm_ncopy_8_bulldozer.S | 1823 +++++++++++++++++++++++ 2 files changed, 1824 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_ncopy_8_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 7732e77fc..59ae72ce2 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -13,7 +13,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S diff --git a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S new file mode 100644 index 000000000..26a14b76a --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S @@ -0,0 +1,1823 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define A_PRE 256 +#define B_PRE 128 + + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define J %r12 +#define MM %r13 + +#else + +#define STACKSIZE 128 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r15 + +#define AO1 %r10 +#define AO2 %r11 +#define LDA3 %r12 +#define J %r13 +#define MM %r14 + +#endif + +#define I %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + vmovups %xmm6, 0(%rsp) + vmovups %xmm7, 16(%rsp) + vmovups %xmm8, 32(%rsp) + vmovups %xmm9, 48(%rsp) + vmovups %xmm10, 64(%rsp) + vmovups %xmm11, 80(%rsp) + vmovups %xmm12, 96(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $3, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L12 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L12: + movq MM, I + sarq $3, I + jle .L14 + ALIGN_4 + 
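+/* .L13: main 8 x 8 packing loop: two doubles are read from each of the eight LDA-strided columns (AO1 .. AO1 + LDA3 and AO2 .. AO2 + LDA3) and transposed with unpcklpd/unpckhpd so that every group of eight doubles stored to B holds one row across those eight columns; this is repeated for offsets 0, 2, 4 and 6, emitting 64 doubles per pass, with prefetchnta on the loads and prefetchw on the stores */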
+.L13: + + prefetchnta A_PRE(AO1) + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + prefetchnta A_PRE(AO1, LDA) + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 0 * SIZE(AO1, LDA3), %xmm3 + + prefetchnta A_PRE(AO1, LDA, 2) + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups %xmm0, %xmm8 + prefetchnta A_PRE(AO1, LDA3) + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + + prefetchnta A_PRE(AO2) + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 0 * SIZE(AO2, LDA), %xmm5 + prefetchnta A_PRE(AO2, LDA) + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 0 * SIZE(AO2, LDA3), %xmm7 + + prefetchnta A_PRE(AO2, LDA, 2) + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + prefetchnta A_PRE(AO2, LDA3) + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + prefetchw B_PRE(B) + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + + prefetchw B_PRE+64(B) + vmovups %xmm8, -8 * SIZE(B) + vmovups %xmm9, -6 * SIZE(B) + vmovups %xmm10, -4 * SIZE(B) + vmovups %xmm11, -2 * SIZE(B) + +/***********************************************************************************************/ + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1, LDA), %xmm1 + vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 2 * SIZE(AO1, LDA3), %xmm3 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + + vmovups 2 * SIZE(AO2), %xmm4 + vmovups 2 * SIZE(AO2, LDA), %xmm5 + vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 2 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + + prefetchw B_PRE+128(B) + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm2, 2 * SIZE(B) + vmovups %xmm4, 4 * SIZE(B) + vmovups %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + + prefetchw B_PRE+192(B) + vmovups %xmm8, 8 * SIZE(B) + vmovups %xmm9, 10 * SIZE(B) + vmovups %xmm10, 12 * SIZE(B) + vmovups %xmm11, 14 * SIZE(B) + +/***********************************************************************************************/ + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 4 * SIZE(AO1, LDA), %xmm1 + vmovups 4 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 4 * SIZE(AO1, LDA3), %xmm3 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + + vmovups 4 * SIZE(AO2), %xmm4 + vmovups 4 * SIZE(AO2, LDA), %xmm5 + vmovups 4 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 4 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + + prefetchw B_PRE+256(B) + vmovups %xmm0, 16 * SIZE(B) + vmovups %xmm2, 18 * SIZE(B) + vmovups %xmm4, 20 * SIZE(B) + vmovups %xmm6, 22 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + + prefetchw B_PRE+320(B) + vmovups %xmm8, 24 * SIZE(B) + vmovups %xmm9, 26 * SIZE(B) + vmovups %xmm10, 28 * SIZE(B) + vmovups %xmm11, 30 * SIZE(B) + +/***********************************************************************************************/ + + vmovups 6 * SIZE(AO1), %xmm0 + vmovups 6 * SIZE(AO1, LDA), %xmm1 + vmovups 6 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 6 * SIZE(AO1, LDA3), %xmm3 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + + vmovups 6 * SIZE(AO2), %xmm4 + vmovups 6 * SIZE(AO2, LDA), %xmm5 + 
vmovups 6 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 6 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + + prefetchw B_PRE+384(B) + vmovups %xmm0, 32 * SIZE(B) + vmovups %xmm2, 34 * SIZE(B) + vmovups %xmm4, 36 * SIZE(B) + vmovups %xmm6, 38 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + + prefetchw B_PRE+448(B) + vmovups %xmm8, 40 * SIZE(B) + vmovups %xmm9, 42 * SIZE(B) + vmovups %xmm10, 44 * SIZE(B) + vmovups %xmm11, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, MM + jle .L16 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 0 * SIZE(AO1, LDA3), %xmm3 + + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 0 * SIZE(AO2, LDA), %xmm5 + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 0 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + vmovups %xmm8, -8 * SIZE(B) + vmovups %xmm9, -6 * SIZE(B) + vmovups %xmm10, -4 * SIZE(B) + vmovups %xmm11, -2 * SIZE(B) + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1, LDA), %xmm1 + vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 2 * SIZE(AO1, LDA3), %xmm3 + + vmovups 2 * SIZE(AO2), %xmm4 + vmovups 2 * SIZE(AO2, LDA), %xmm5 + vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 2 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm2, 2 * SIZE(B) + vmovups %xmm4, 4 * SIZE(B) + vmovups %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + vmovups %xmm8, 8 * SIZE(B) + vmovups %xmm9, 10 * SIZE(B) + vmovups %xmm10, 12 * SIZE(B) + vmovups %xmm11, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L16: + testq $2, MM + jle .L18 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 0 * SIZE(AO1, LDA3), %xmm3 + + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 0 * SIZE(AO2, LDA), %xmm5 + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 0 * SIZE(AO2, LDA3), %xmm7 + + vmovups %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + vmovups %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + vmovups %xmm8, -8 * SIZE(B) + vmovups %xmm9, -6 * SIZE(B) + vmovups %xmm10, -4 * SIZE(B) + vmovups %xmm11, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, 
LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $4, N + jle .L30 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 0 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1, LDA), %xmm1 + vmovups 2 * SIZE(AO2), %xmm2 + vmovups 2 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm2, -6 * SIZE(B) + vmovups %xmm4, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 4 * SIZE(AO1, LDA), %xmm1 + vmovups 4 * SIZE(AO2), %xmm2 + vmovups 4 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, 0 * SIZE(B) + vmovups %xmm2, 2 * SIZE(B) + vmovups %xmm4, 4 * SIZE(B) + vmovups %xmm6, 6 * SIZE(B) + + + vmovups 6 * SIZE(AO1), %xmm0 + vmovups 6 * SIZE(AO1, LDA), %xmm1 + vmovups 6 * SIZE(AO2), %xmm2 + vmovups 6 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, 8 * SIZE(B) + vmovups %xmm2, 10 * SIZE(B) + vmovups %xmm4, 12 * SIZE(B) + vmovups %xmm6, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 0 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1, LDA), %xmm1 + vmovups 2 * SIZE(AO2), %xmm2 + vmovups 2 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm2, -6 * SIZE(B) + vmovups %xmm4, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + 
addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 0 * SIZE(AO2, LDA), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L30: + testq $2, N + jle .L40 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L32 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L32: + movq MM, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 2 * SIZE(AO2), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm4, -14 * SIZE(B) + vmovups %xmm2, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 4 * SIZE(AO2), %xmm1 + vmovups 6 * SIZE(AO1), %xmm2 + vmovups 6 * SIZE(AO2), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm4, -6 * SIZE(B) + vmovups %xmm2, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, MM + jle .L36 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 2 * SIZE(AO2), %xmm3 + + vmovups %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + vmovups %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm4, -14 * SIZE(B) + vmovups %xmm2, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L36: + testq $2, MM + jle .L38 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 0 * SIZE(AO2), %xmm1 + + vmovups %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L38: + testq $1, MM + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L40: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L45 + + movq MM, I + sarq $3, I + jle .L42 + ALIGN_4 + +.L41: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm1 + vmovups 4 * SIZE(AO1), %xmm2 + vmovups 6 * SIZE(AO1), %xmm3 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + vmovups %xmm2, -12 * SIZE(B) + vmovups %xmm3, -10 * 
SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L41 + ALIGN_4 + +.L42: + testq $4, MM + jle .L43 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm1 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L43: + testq $2, MM + jle .L44 + + vmovups 0 * SIZE(AO1), %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L44: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L45: + vmovups -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L46 + ALIGN_4 + +.L46: + + vmovups 1 * SIZE(AO1), %xmm1 + vmovups 3 * SIZE(AO1), %xmm2 + vmovups 5 * SIZE(AO1), %xmm3 + vmovups 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + vmovups %xmm2, -12 * SIZE(B) + vmovups %xmm3, -10 * SIZE(B) + + vmovups %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L46 + ALIGN_4 + +.L47: + testq $4, M + jle .L48 + + vmovups 1 * SIZE(AO1), %xmm1 + vmovups 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + + vmovups %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L48: + testq $2, M + jle .L49 + + vmovups 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + vmovups %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L49: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $3, J + jle .L60 + ALIGN_4 + +.L51: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L52 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L52: + vmovups -1 * SIZE(AO1, LDA), %xmm9 + vmovups -1 * SIZE(AO1, LDA3), %xmm10 + vmovups -1 * SIZE(AO2, LDA), %xmm11 + vmovups -1 * SIZE(AO2, LDA3), %xmm12 + + movq MM, I + sarq $3, I + jle .L54 + ALIGN_4 + +.L53: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 1 * SIZE(AO1, LDA3), %xmm3 + + + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 1 * SIZE(AO2, LDA), %xmm5 + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + + vmovups %xmm9, -16 * SIZE(B) + vmovups %xmm10, -14 * SIZE(B) + vmovups %xmm11, -12 * SIZE(B) + vmovups %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm2, -6 * SIZE(B) + vmovups %xmm4, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 3 * SIZE(AO1, LDA), %xmm9 + vmovups 2 * SIZE(AO1, LDA, 2), 
%xmm2 + vmovups 3 * SIZE(AO1, LDA3), %xmm10 + + + vmovups 2 * SIZE(AO2), %xmm4 + vmovups 3 * SIZE(AO2, LDA), %xmm11 + vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + + + vmovups %xmm1, 0 * SIZE(B) + vmovups %xmm3, 2 * SIZE(B) + vmovups %xmm5, 4 * SIZE(B) + vmovups %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + + + vmovups %xmm0, 8 * SIZE(B) + vmovups %xmm2, 10 * SIZE(B) + vmovups %xmm4, 12 * SIZE(B) + vmovups %xmm6, 14 * SIZE(B) + + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 5 * SIZE(AO1, LDA), %xmm1 + vmovups 4 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 5 * SIZE(AO1, LDA3), %xmm3 + + + vmovups 4 * SIZE(AO2), %xmm4 + vmovups 5 * SIZE(AO2, LDA), %xmm5 + vmovups 4 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 5 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + + vmovups %xmm9, 16 * SIZE(B) + vmovups %xmm10, 18 * SIZE(B) + vmovups %xmm11, 20 * SIZE(B) + vmovups %xmm12, 22 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + + vmovups %xmm0, 24 * SIZE(B) + vmovups %xmm2, 26 * SIZE(B) + vmovups %xmm4, 28 * SIZE(B) + vmovups %xmm6, 30 * SIZE(B) + + + vmovups 6 * SIZE(AO1), %xmm0 + vmovups 7 * SIZE(AO1, LDA), %xmm9 + vmovups 6 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 7 * SIZE(AO1, LDA3), %xmm10 + + + vmovups 6 * SIZE(AO2), %xmm4 + vmovups 7 * SIZE(AO2, LDA), %xmm11 + vmovups 6 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 7 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + + + vmovups %xmm1, 32 * SIZE(B) + vmovups %xmm3, 34 * SIZE(B) + vmovups %xmm5, 36 * SIZE(B) + vmovups %xmm7, 38 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + + vmovups %xmm0, 40 * SIZE(B) + vmovups %xmm2, 42 * SIZE(B) + vmovups %xmm4, 44 * SIZE(B) + vmovups %xmm6, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L53 + ALIGN_4 + +.L54: + testq $4, MM + jle .L56 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 1 * SIZE(AO1, LDA3), %xmm3 + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 1 * SIZE(AO2, LDA), %xmm5 + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + vmovups %xmm9, -16 * SIZE(B) + vmovups %xmm10, -14 * SIZE(B) + vmovups %xmm11, -12 * SIZE(B) + vmovups %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm2, -6 * SIZE(B) + vmovups %xmm4, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 3 * SIZE(AO1, LDA), %xmm9 + vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 3 * SIZE(AO1, LDA3), %xmm10 + vmovups 2 * SIZE(AO2), %xmm4 + vmovups 3 * SIZE(AO2, LDA), %xmm11 + vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + + vmovups %xmm1, 0 * SIZE(B) + vmovups %xmm3, 2 * SIZE(B) + vmovups %xmm5, 4 * SIZE(B) + vmovups %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 
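+	/* shufpd $1 merges the high element of each aligned-column load with the low element of */
+	/* the one-element-shifted load from the unaligned column, forming the next row for B    */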
+ shufpd $1, %xmm12, %xmm6 + + vmovups %xmm0, 8 * SIZE(B) + vmovups %xmm2, 10 * SIZE(B) + vmovups %xmm4, 12 * SIZE(B) + vmovups %xmm6, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L56: + testq $2, MM + jle .L58 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 + vmovups 1 * SIZE(AO1, LDA3), %xmm3 + vmovups 0 * SIZE(AO2), %xmm4 + vmovups 1 * SIZE(AO2, LDA), %xmm5 + vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 + vmovups 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + vmovups %xmm9, -16 * SIZE(B) + vmovups %xmm10, -14 * SIZE(B) + vmovups %xmm11, -12 * SIZE(B) + vmovups %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + vmovups %xmm0, -8 * SIZE(B) + vmovups %xmm2, -6 * SIZE(B) + vmovups %xmm4, -4 * SIZE(B) + vmovups %xmm6, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L59: + decq J + jg .L51 + ALIGN_4 + +.L60: + testq $4, N + jle .L70 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L62: + vmovups -1 * SIZE(AO1, LDA), %xmm5 + vmovups -1 * SIZE(AO2, LDA), %xmm7 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm7, -14 * SIZE(B) + vmovups %xmm0, -12 * SIZE(B) + vmovups %xmm2, -10 * SIZE(B) + + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 3 * SIZE(AO1, LDA), %xmm5 + vmovups 2 * SIZE(AO2), %xmm2 + vmovups 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + + + vmovups %xmm1, -8 * SIZE(B) + vmovups %xmm3, -6 * SIZE(B) + vmovups %xmm0, -4 * SIZE(B) + vmovups %xmm2, -2 * SIZE(B) + + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 5 * SIZE(AO1, LDA), %xmm1 + vmovups 4 * SIZE(AO2), %xmm2 + vmovups 5 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + + vmovups %xmm5, 0 * SIZE(B) + vmovups %xmm7, 2 * SIZE(B) + vmovups %xmm0, 4 * SIZE(B) + vmovups %xmm2, 6 * SIZE(B) + + + vmovups 6 * SIZE(AO1), %xmm0 + vmovups 7 * SIZE(AO1, LDA), %xmm5 + vmovups 6 * SIZE(AO2), %xmm2 + vmovups 7 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, 
%xmm7, %xmm2 + + + vmovups %xmm1, 8 * SIZE(B) + vmovups %xmm3, 10 * SIZE(B) + vmovups %xmm0, 12 * SIZE(B) + vmovups %xmm2, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm7 + shufpd $1, %xmm3, %xmm2 + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm7, -14 * SIZE(B) + vmovups %xmm0, -12 * SIZE(B) + vmovups %xmm2, -10 * SIZE(B) + + vmovups 2 * SIZE(AO1), %xmm0 + vmovups 3 * SIZE(AO1, LDA), %xmm5 + vmovups 2 * SIZE(AO2), %xmm2 + vmovups 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + shufpd $1, %xmm5, %xmm0 + movsd %xmm2, %xmm3 + shufpd $1, %xmm7, %xmm2 + + vmovups %xmm1, -8 * SIZE(B) + vmovups %xmm3, -6 * SIZE(B) + vmovups %xmm0, -4 * SIZE(B) + vmovups %xmm2, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO1, LDA), %xmm1 + vmovups 0 * SIZE(AO2), %xmm2 + vmovups 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm7, -14 * SIZE(B) + vmovups %xmm0, -12 * SIZE(B) + vmovups %xmm2, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L70 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L70: + testq $2, N + jle .L80 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L72 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L72: + vmovups -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L74 + ALIGN_4 + +.L73: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm0, -14 * SIZE(B) + vmovups %xmm1, -12 * SIZE(B) + vmovups %xmm2, -10 * SIZE(B) + + + vmovups 4 * SIZE(AO1), %xmm0 + vmovups 5 * SIZE(AO2), %xmm1 + vmovups 6 * SIZE(AO1), %xmm2 + vmovups 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + + + vmovups %xmm3, -8 * SIZE(B) + vmovups %xmm0, -6 * SIZE(B) + vmovups %xmm1, -4 * SIZE(B) + vmovups %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L73 + ALIGN_4 + +.L74: + testq $4, MM + jle .L76 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO2), %xmm1 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm0, -14 * SIZE(B) + vmovups %xmm1, -12 * SIZE(B) + vmovups %xmm2, -10 * SIZE(B) + + vmovups %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L76: + testq 
$2, MM + jle .L78 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + vmovups %xmm5, -16 * SIZE(B) + vmovups %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L78: + testq $1, MM + jle .L80 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L80: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L85 + + movq MM, I + sarq $3, I + jle .L82 + ALIGN_4 + +.L81: + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm2 + vmovups 4 * SIZE(AO1), %xmm4 + vmovups 6 * SIZE(AO1), %xmm6 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + vmovups %xmm4, -12 * SIZE(B) + vmovups %xmm6, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L81 + ALIGN_4 + +.L82: + testq $4, MM + jle .L83 + + vmovups 0 * SIZE(AO1), %xmm0 + vmovups 2 * SIZE(AO1), %xmm2 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm2, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L83: + testq $2, MM + jle .L84 + + vmovups 0 * SIZE(AO1), %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L84: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L85: + vmovups -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L86 + ALIGN_4 + +.L86: + + vmovups 1 * SIZE(AO1), %xmm1 + vmovups 3 * SIZE(AO1), %xmm2 + vmovups 5 * SIZE(AO1), %xmm3 + vmovups 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + vmovups %xmm2, -12 * SIZE(B) + vmovups %xmm3, -10 * SIZE(B) + + vmovups %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L86 + ALIGN_4 + +.L87: + testq $4, M + jle .L88 + + vmovups 1 * SIZE(AO1), %xmm1 + vmovups 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + vmovups %xmm0, -16 * SIZE(B) + vmovups %xmm1, -14 * SIZE(B) + + vmovups %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L88: + testq $2, M + jle .L89 + + vmovups 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + vmovups %xmm0, -16 * SIZE(B) + + vmovups %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L89: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + vmovups 0(%rsp), %xmm6 + vmovups 16(%rsp), %xmm7 + vmovups 32(%rsp), %xmm8 + vmovups 48(%rsp), %xmm9 + vmovups 64(%rsp), %xmm10 + vmovups 80(%rsp), %xmm11 + vmovups 96(%rsp), %xmm12 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE