Enable a new build target platform: COOPERLAKE. This target supports all ISAs covered by SKYLAKEX plus avx512bf16, so all SKYLAKEX-specific kernels, drivers, and related code are now also active on COOPERLAKE. In addition, new BF16-related kernels are enabled under this target.
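As a quick check that a build actually picked up the new target, the detected core name can be queried at runtime. The sketch below is a minimal example, assuming an OpenBLAS build (either `make TARGET=COOPERLAKE`, or a `DYNAMIC_ARCH` build running on Cooper Lake hardware) and the `openblas_get_corename()`/`openblas_get_config()` helpers that OpenBLAS exports; the exact strings printed depend on the build.

```c
#include <stdio.h>

/* Prototypes as exported by OpenBLAS (also declared in cblas.h). */
extern char *openblas_get_corename(void);
extern char *openblas_get_config(void);

int main(void) {
    /* On Cooper Lake hardware, a build with COOPERLAKE support is
       expected to report the Cooperlake core and therefore select the
       SKYLAKEX/BF16 kernels enabled by this change. */
    printf("core:   %s\n", openblas_get_corename());
    printf("config: %s\n", openblas_get_config());
    return 0;
}
```

Compile with, e.g., `gcc check_core.c -lopenblas` against the freshly built library.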
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15
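
/* Single-precision symv-style kernel: y := alpha*A*x + y for a
   symmetric matrix A. alpha is folded into a buffered copy of x up
   front; when INCY != 1, y is also staged contiguously in BUFFER.
   The main loop then walks A four columns at a time. */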

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	shufps	$0, ALPHA, ALPHA

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3
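/* .L01: copy x into BUFFER, scaling by alpha, eight elements per pass;
   .L03 handles the remaining M % 8 elements. */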
.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3
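/* .L05: round the buffer cursor up to the next 512-byte boundary; if
   INCY != 1, gather y into the buffer too so the kernel below can
   assume unit stride (NEW_Y then points at the staged copy). */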
.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512,  XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3
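/* .L11: four-column panel loop. atemp1..atemp4 hold the broadcast x
   values of the panel, xsum1..xsum4 accumulate the per-column dot
   products (folded into y[IS..IS+3] at .L19), and yy1 carries the
   running update of the remaining y elements. */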
.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movaps	0 * SIZE(XX), atemp4

	movsd	0 * SIZE(A1), xsum1
	movhps	2 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A1, LDA, 1), a3
	movss	3 * SIZE(A1, LDA, 1), a4
	unpcklps a3, xsum2
	unpcklps a4, a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	movss	2 * SIZE(A1), xsum3
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A2), a3
	movss	3 * SIZE(A2), a4
	unpcklps a3, xsum3
	unpcklps a4, a2
	unpcklps a2, xsum3
	mulps	atemp4, xsum3

	movss	3 * SIZE(A1), xsum4
	movss	3 * SIZE(A1, LDA, 1), a2
	movss	3 * SIZE(A2), a3
	movss	3 * SIZE(A2, LDA, 1), a4
	unpcklps a3, xsum4
	unpcklps a4, a2
	unpcklps a2, xsum4
	mulps	atemp4, xsum4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	movaps	4 * SIZE(XX), xtemp1
	movaps	8 * SIZE(XX), xtemp2

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M, I
	subq	IS, I
	subq	$4, I
	sarq	$4, I
	jle	.L14
	ALIGN_3
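/* .L12: inner loop unrolled to 16 rows. Each four-row step uses every
   panel element twice: xt1 = x*a feeds the column sums (xsum1..4) and
   atempN*a is added into y (yy1), with software prefetch on the four
   column streams. */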
.L12:
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 12 * SIZE(YY)
	movhps	yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	$16 * SIZE, XX
	addq	$16 * SIZE, YY
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3
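/* .L14: row remainder, eight rows at a time. */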
.L14:
	movq	M, I
	subq	IS, I
	subq	$4, I
	test	$8, I
	jle	.L15

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	ALIGN_3
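/* .L15: row remainder, four rows. */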
.L15:
	test	$4, I
	jle	.L17

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movsd	4 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3
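/* .L17: row remainder, two rows; the upper halves of the column
   registers are cleared via movlhps with a zeroed xtemp2. */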
.L17:
	testq	$2, M
	jle	.L18

	pxor	xtemp2, xtemp2

	movlhps	xtemp2, a1
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movss	2 * SIZE(A1), a1

	movlhps	xtemp2, a2
	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movss	2 * SIZE(A1, LDA, 1), a2

	movlhps	xtemp2, a3
	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movss	2 * SIZE(A2), a3

	movlhps	xtemp2, a4
	movaps	xtemp1, xt1
	movss	2 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movss	2 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movss	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3
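/* .L18: row remainder, final single row (scalar arithmetic). */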
.L18:
	testq	$1, M
	jle	.L19

	movss	0 * SIZE(XX), xtemp1

	movss	0 * SIZE(YY), yy1

	movss	0 * SIZE(A1), a1
	movss	0 * SIZE(A1, LDA, 1), a2
	movss	0 * SIZE(A2), a3
	movss	0 * SIZE(A2, LDA, 1), a4

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addss	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movaps	xtemp1, xt1
	mulss	a3, xt1
	mulss	atemp3, a3
	addss	xt1, xsum3
	addss	a3, yy1

	movaps	xtemp1, xt1
	mulss	a4, xt1
	mulss	atemp4, a4
	addss	xt1, xsum4
	addss	a4, yy1

	movss	yy1, 0 * SIZE(YY)
	ALIGN_3
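/* .L19: reduce the four column sums to one vector (a 4x4 transpose
   with unpcklps/unpckhps when SSE3 is unavailable, haddps otherwise)
   and add the result into y[IS..IS+3]. */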
.L19:
#ifndef HAVE_SSE3
	movaps	xsum1, xtemp1
	unpcklps xsum3, xsum1
	unpckhps xsum3, xtemp1

	movaps	xsum2, xtemp2
	unpcklps xsum4, xsum2
	unpckhps xsum4, xtemp2

	movaps	xsum1, xsum3
	unpcklps xsum2, xsum1
	unpckhps xsum2, xsum3

	movaps	xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	xsum3,  xsum1
	addps	xtemp1, xsum4
	addps	xsum4,  xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3

	haddps	xsum3, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhps	yy1, 2 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	N, I
	jle	.L11
	ALIGN_3
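/* .L20/.L30: handle the last two columns, then a final single column. */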
.L20:
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	pxor	xsum1, xsum1
#endif
	movsd	0 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	movss	2 * SIZE(A1), a1
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addps	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movss	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	xsum1, xsum2
	addps	xsum2, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum1, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3

.L30:
	testq	$1, N
	jle	.L990

	movss	0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	0 * SIZE(A), xsum1
	addss	0 * SIZE(NEW_Y, IS, SIZE), xsum1
	movss	xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3
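/* .L990: if y was staged in the buffer, write the results back to the
   caller's y with stride INCY. */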
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	0 * SIZE(NEW_Y), %xmm0
	movss	1 * SIZE(NEW_Y), %xmm1
	movss	2 * SIZE(NEW_Y), %xmm2
	movss	3 * SIZE(NEW_Y), %xmm3
	movss	4 * SIZE(NEW_Y), %xmm4
	movss	5 * SIZE(NEW_Y), %xmm5
	movss	6 * SIZE(NEW_Y), %xmm6
	movss	7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3
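/* .L999: restore callee-saved registers (and, on Windows, the
   preserved xmm registers) and return. */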
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE