Merge pull request #233 from wernsaar/develop

added dgemv_n and some faster gemm_copy routines to BULLDOZER.
Zhang Xianyi 2013-06-18 20:02:36 -07:00
commit 646e168d26
7 changed files with 5558 additions and 6 deletions
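The dgemv_n_bulldozer.S kernel itself appears to be one of the two diffs suppressed below as too large. For orientation, a GEMV "N" kernel applies the non-transposed matrix: y := alpha*A*x + y for a column-major m-by-n A. The following is only an editor's reference sketch of those semantics in C, not the Bulldozer kernel or its exact signature (the real kernel additionally handles the x/y strides and is written in assembly):

/* Editor's sketch only -- NOT the Bulldozer kernel or its exact signature. */
static void dgemv_n_ref(int m, int n, double alpha,
                        const double *a, int lda,   /* column-major m x n */
                        const double *x, double *y)
{
    for (int j = 0; j < n; j++) {
        double t = alpha * x[j];            /* scale x once per column */
        for (int i = 0; i < m; i++)
            y[i] += t * a[i + (long)j * lda];
    }
}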


@@ -1,20 +1,22 @@
 ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t_dup.S
+DGEMVNKERNEL = dgemv_n_bulldozer.S
 SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY = ../generic/gemm_tcopy_16.c
-SGEMMONCOPY = ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
 SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
-DGEMMINCOPY = ../generic/gemm_ncopy_8.c
-DGEMMITCOPY = ../generic/gemm_tcopy_8.c
-DGEMMONCOPY = ../generic/gemm_ncopy_2.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
+DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
+DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
 DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
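
For reference, the ../generic/gemm_ncopy_2.c packing that gemm_ncopy_2_bulldozer.S replaces (and that the new assembly further below has to reproduce) does roughly the following. This is an editor's simplified sketch, written for double for concreteness:

/* Editor's sketch of the generic 2-wide "ncopy" packing (simplified):
   interleave element i of two LDA-strided columns into consecutive slots
   of the packed buffer B consumed by the 16x2/8x2 GEMM kernels above. */
static void gemm_ncopy_2_ref(long m, long n,
                             const double *a, long lda, double *b)
{
    long i, j;
    for (j = 0; j + 1 < n; j += 2) {
        const double *a0 = a + (j + 0) * lda;
        const double *a1 = a + (j + 1) * lda;
        for (i = 0; i < m; i++) {
            *b++ = a0[i];
            *b++ = a1[i];
        }
    }
    if (n & 1) {                            /* odd trailing column */
        const double *a0 = a + (n - 1) * lda;
        for (i = 0; i < m; i++)
            *b++ = a0[i];
    }
}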

File diff suppressed because it is too large


@@ -0,0 +1,667 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
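/* Editor's note (assumption -- the file name is not shown in this view):
   this is a double-precision-only packing routine, so it should be one of
   dgemm_ncopy_8_bulldozer.S / dgemm_tcopy_8_bulldozer.S added in the
   Makefile hunk above.  Each inner iteration copies eight consecutive
   elements from each of eight LDA-strided rows/columns of A (with 4/2/1
   tails in both directions, written through B0/B1/B2/B3) into the packed
   buffer consumed by dgemm_kernel_8x2_bulldozer.S. */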
#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS
#define A_PRE 256
#ifndef WINDOWS_ABI
#define N ARG1 /* rsi */
#define M ARG2 /* rdi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define AO1 %r9
#define AO2 %r10
#define LDA3 %r11
#define M8 %r12
#else
#define N ARG1 /* rdx */
#define M ARG2 /* rcx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 56(%rsp)
#define B %r12
#define AO1 %rsi
#define AO2 %rdi
#define LDA3 %r10
#define M8 %r11
#endif
#define I %rax
#define B0 %rbp
#define B1 %r13
#define B2 %r14
#define B3 %r15
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
#ifdef WINDOWS_ABI
movq OLD_B, B
#endif
subq $-16 * SIZE, B
movq M, B1
movq M, B2
movq M, B3
andq $-8, B1
andq $-4, B2
andq $-2, B3
imulq N, B1
imulq N, B2
imulq N, B3
leaq (B, B1, SIZE), B1
leaq (B, B2, SIZE), B2
leaq (B, B3, SIZE), B3
leaq (,LDA, SIZE), LDA
leaq (LDA, LDA, 2), LDA3
leaq (, N, SIZE), M8
cmpq $8, N
jl .L20
ALIGN_4
.L11:
subq $8, N
movq A, AO1
leaq (A, LDA, 4), AO2
leaq (A, LDA, 8), A
movq B, B0
addq $64 * SIZE, B
movq M, I
sarq $3, I
jle .L14
ALIGN_4
.L13:
prefetchnta A_PRE(AO1)
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)
vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)
prefetchnta A_PRE(AO2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, 16 * SIZE(B0)
vmovups %xmm1, 18 * SIZE(B0)
vmovups %xmm2, 20 * SIZE(B0)
vmovups %xmm3, 22 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 24 * SIZE(B0)
vmovups %xmm1, 26 * SIZE(B0)
vmovups %xmm2, 28 * SIZE(B0)
vmovups %xmm3, 30 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)
vmovups %xmm0, 32 * SIZE(B0)
vmovups %xmm1, 34 * SIZE(B0)
vmovups %xmm2, 36 * SIZE(B0)
vmovups %xmm3, 38 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, 40 * SIZE(B0)
vmovups %xmm1, 42 * SIZE(B0)
vmovups %xmm2, 44 * SIZE(B0)
vmovups %xmm3, 46 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L13
ALIGN_4
.L14:
testq $4, M
jle .L16
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 0 * SIZE(B1)
vmovups %xmm1, 2 * SIZE(B1)
vmovups %xmm2, 4 * SIZE(B1)
vmovups %xmm3, 6 * SIZE(B1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B1)
vmovups %xmm1, 10 * SIZE(B1)
vmovups %xmm2, 12 * SIZE(B1)
vmovups %xmm3, 14 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-32 * SIZE, B1
ALIGN_4
.L16:
testq $2, M
jle .L18
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B2)
vmovups %xmm1, -6 * SIZE(B2)
vmovups %xmm2, -4 * SIZE(B2)
vmovups %xmm3, -2 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-16 * SIZE, B2
ALIGN_4
.L18:
testq $1, M
jle .L19
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO1, LDA3), %xmm3
vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2
vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
vmovsd 0 * SIZE(AO2), %xmm0
vmovsd 0 * SIZE(AO2, LDA), %xmm1
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO2, LDA3), %xmm3
vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2
vmovups %xmm0, -12 * SIZE(B3)
vmovups %xmm2, -10 * SIZE(B3)
subq $-8 * SIZE, B3
ALIGN_4
.L19:
cmpq $8, N
jge .L11
ALIGN_4
.L20:
cmpq $4, N
jl .L30
subq $4, N
movq A, AO1
leaq (A, LDA, 2), AO2
leaq (A, LDA, 4), A
movq B, B0
addq $32 * SIZE, B
movq M, I
sarq $3, I
jle .L24
ALIGN_4
.L23:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L23
ALIGN_4
.L24:
testq $4, M
jle .L26
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-16 * SIZE, B1
ALIGN_4
.L26:
testq $2, M
jle .L28
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-8 * SIZE, B2
ALIGN_4
.L28:
testq $1, M
jle .L30
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO2), %xmm2
vmovsd 0 * SIZE(AO2, LDA), %xmm3
vunpcklpd %xmm1, %xmm0, %xmm0
vunpcklpd %xmm3, %xmm2, %xmm2
vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
subq $-4 * SIZE, B3
ALIGN_4
.L30:
cmpq $2, N
jl .L40
subq $2, N
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A
movq B, B0
addq $16 * SIZE, B
movq M, I
sarq $3, I
jle .L34
ALIGN_4
.L33:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L33
ALIGN_4
.L34:
testq $4, M
jle .L36
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B1
ALIGN_4
.L36:
testq $2, M
jle .L38
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-4 * SIZE, B2
ALIGN_4
.L38:
testq $1, M
jle .L40
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1
vunpcklpd %xmm1, %xmm0, %xmm0
vmovups %xmm0, -16 * SIZE(B3)
subq $-2 * SIZE, B3
ALIGN_4
.L40:
cmpq $1, N
jl .L999
movq A, AO1
movq B, B0
movq M, I
sarq $3, I
jle .L44
ALIGN_4
.L43:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
addq $8 * SIZE, AO1
leaq (B0, M8, 8), B0
decq I
jg .L43
ALIGN_4
.L44:
testq $4, M
jle .L45
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
addq $4 * SIZE, AO1
subq $-4 * SIZE, B1
ALIGN_4
.L45:
testq $2, M
jle .L46
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
vmovups %xmm0, -16 * SIZE(B2)
addq $2 * SIZE, AO1
subq $-2 * SIZE, B2
ALIGN_4
.L46:
testq $1, M
jle .L999
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
.L999:
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif
ret
EPILOGUE

File diff suppressed because it is too large


@@ -0,0 +1,360 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
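/* Editor's note (assumption -- the file name is not shown in this view):
   the #ifndef DOUBLE paths and the 2-wide interleaving below suggest this
   is gemm_ncopy_2_bulldozer.S from the Makefile hunk above.  It packs two
   LDA-strided columns of A at a time, interleaving their elements into B
   (b[2*i] = col0[i], b[2*i+1] = col1[i]), i.e. the same layout as the
   ../generic/gemm_ncopy_2.c it replaces, plus a tail loop for an odd
   trailing column. */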
#ifndef WINDOWS_ABI
#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define I %r9
#else
#define STACKSIZE 256
#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)
#define B %r14
#define I %r15
#endif
#define J %r10
#define AO1 %r11
#define AO2 %r12
#define AO3 %r13
#define AO4 %rax
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
#endif
pushq %r13
pushq %r12
#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)
movq OLD_B, B
#endif
leaq (,LDA, SIZE), LDA # Scaling
movq N, J
sarq $1, J
jle .L20
ALIGN_4
.L01:
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A
movq M, I
sarq $3, I
jle .L08
ALIGN_4
.L03:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7
vmovss 4 * SIZE(AO1), %xmm8
vmovss 4 * SIZE(AO2), %xmm9
vmovss 5 * SIZE(AO1), %xmm10
vmovss 5 * SIZE(AO2), %xmm11
vmovss 6 * SIZE(AO1), %xmm12
vmovss 6 * SIZE(AO2), %xmm13
vmovss 7 * SIZE(AO1), %xmm14
vmovss 7 * SIZE(AO2), %xmm15
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
vmovss %xmm8, 8 * SIZE(B)
vmovss %xmm9, 9 * SIZE(B)
vmovss %xmm10, 10 * SIZE(B)
vmovss %xmm11, 11 * SIZE(B)
vmovss %xmm12, 12 * SIZE(B)
vmovss %xmm13, 13 * SIZE(B)
vmovss %xmm14, 14 * SIZE(B)
vmovss %xmm15, 15 * SIZE(B)
#else
prefetchw 256(B)
prefetchnta 256(AO1)
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 5 * SIZE(AO1), %xmm5
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 7 * SIZE(AO1), %xmm7
prefetchnta 256(AO2)
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7
prefetchw 256+64(B)
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
vmovups %xmm4, 8 * SIZE(B)
vmovups %xmm5, 10 * SIZE(B)
vmovups %xmm6, 12 * SIZE(B)
vmovups %xmm7, 14 * SIZE(B)
#endif
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
subq $-16 * SIZE, B
decq I
jg .L03
ALIGN_4
.L08:
testq $4 , M
je .L14
ALIGN_4
.L13:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B
ALIGN_4
.L14:
movq M, I
andq $3, I
jle .L16
ALIGN_4
.L15:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovups %xmm0, 0 * SIZE(B)
#endif
addq $SIZE, AO1
addq $SIZE, AO2
addq $2 * SIZE, B
decq I
jg .L15
ALIGN_4
.L16:
decq J
jg .L01
ALIGN_4
.L20:
testq $1, N
jle .L999
movq A, AO1
movq M, I
sarq $2, I
jle .L34
ALIGN_4
.L33:
#ifndef DOUBLE
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(B)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm1
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
#endif
addq $4 * SIZE, AO1
subq $-4 * SIZE, B
decq I
jg .L33
ALIGN_4
.L34:
movq M, I
andq $3, I
jle .L999
ALIGN_4
.L35:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(B)
#endif
addq $SIZE, AO1
addq $1 * SIZE, B
decq I
jg .L35
ALIGN_4
.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15
addq $STACKSIZE, %rsp
#endif
popq %r12
popq %r13
#ifdef WINDOWS_ABI
popq %r14
popq %r15
#endif
ret
EPILOGUE


@@ -0,0 +1,374 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
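/* Editor's note (assumption -- the file name is not shown in this view):
   this appears to be gemm_tcopy_2_bulldozer.S from the Makefile hunk
   above.  The outer loop steps through M two LDA-strided vectors at a
   time, the inner loop steps through N, and each 2x2 block is scattered
   into B with a stride of 2*M elements (the "leaq (BO, M8, 2), BO"
   steps), i.e. a transposed 2-wide packing like the generic
   ../generic/gemm_tcopy_2.c it replaces; the odd trailing N column is
   written through BO1. */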
#ifndef WINDOWS_ABI
#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define I %r10
#define J %rbp
#define AO1 %r9
#define AO2 %r15
#define AO3 %r11
#define AO4 %r14
#define BO1 %r13
#define M8 %rbx
#define BO %rax
#else
#define STACKSIZE 256
#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 64 + STACKSIZE(%rsp)
#define B %rdi
#define I %r10
#define J %r11
#define AO1 %r12
#define AO2 %r13
#define AO3 %r14
#define AO4 %r15
#define BO1 %rsi
#define M8 %rbp
#define BO %rax
#endif
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx
#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)
movq OLD_B, B
#endif
movq N, %rax
andq $-2, %rax
imulq M, %rax
leaq (B, %rax, SIZE), BO1
leaq (, LDA, SIZE), LDA
leaq (, M, SIZE), M8
movq M, J
sarq $1, J
jle .L20
ALIGN_4
.L01:
movq A, AO1
leaq (A, LDA ), AO2
leaq (A, LDA, 2), A
movq B, BO
addq $4 * SIZE, B
movq N, I
sarq $3, I
jle .L10
ALIGN_4
.L08:
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd 4 * SIZE(AO2), %xmm5
vmovsd 6 * SIZE(AO2), %xmm7
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm4, 0 * SIZE(BO)
vmovsd %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm6, 0 * SIZE(BO)
vmovsd %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#else
prefetchnta 256(AO1)
prefetchnta 256(AO2)
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 4 * SIZE(AO1), %xmm4
vmovups 6 * SIZE(AO1), %xmm6
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups 4 * SIZE(AO2), %xmm5
vmovups 6 * SIZE(AO2), %xmm7
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm4, 0 * SIZE(BO)
vmovups %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm6, 0 * SIZE(BO)
vmovups %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#endif
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
decq I
jg .L08
ALIGN_4
.L10:
testq $4, N
jle .L12
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
ALIGN_4
.L12:
testq $2, N
jle .L14
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 0 * SIZE(AO2), %xmm1
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
#endif
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
ALIGN_4
.L14:
testq $1, N
jle .L19
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss %xmm0, 0 * SIZE(BO1)
vmovss %xmm1, 1 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovups %xmm0, 0 * SIZE(BO1)
#endif
addq $2 * SIZE, BO1
ALIGN_4
.L19:
decq J
jg .L01
ALIGN_4
.L20:
testq $1, M
jle .L999
ALIGN_4
.L31:
movq A, AO1
movq B, BO
movq N, I
sarq $1, I
jle .L33
ALIGN_4
.L32:
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(BO)
#endif
addq $2 * SIZE, AO1
leaq (BO, M8, 2), BO
decq I
jg .L32
ALIGN_4
.L33:
testq $1, N
jle .L999
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO1)
#endif
addq $1 * SIZE, BO1
ALIGN_4
.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15
addq $STACKSIZE, %rsp
#endif
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif
ret
EPILOGUE


@@ -187,6 +187,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM3M_DEFAULT_UNROLL_M 8
 #define ZGEMM3M_DEFAULT_UNROLL_N 4
 #define ZGEMM3M_DEFAULT_UNROLL_M 4
+#define GEMV_UNROLL 8
 #endif
 #if 0