Enable a new build target platform: COOPERLAKE. This target supports all ISAs covered by SKYLAKEX plus AVX512-BF16, so all SKYLAKEX-specific kernels/drivers and related code are now also active on COOPERLAKE. In addition, new BF16 kernels are enabled under this target.
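
The change follows the guard pattern visible later in this file: wherever a code path was gated on SKYLAKEX, the condition gains a COOPERLAKE alternative. A minimal sketch of that pattern (the macro names are real; the body is illustrative only):

#if defined (SKYLAKEX) || defined (COOPERLAKE)
/* SKYLAKEX-tuned code path, now shared with COOPERLAKE */
#endif
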
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either          */
/*    expressed or implied, of The University of Texas at Austin.    */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
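
/* Prefetch instruction flavor and lookahead distance, selected per target CPU */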
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
#define movsd		movlpd
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif
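
/* Argument and stack layout differ between the SysV and Windows x86_64 ABIs */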
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define OLD_INCX	64 + STACKSIZE(%rsp)
#define OLD_Y		72 + STACKSIZE(%rsp)
#define OLD_INCY	80 + STACKSIZE(%rsp)
#define OLD_BUFFER	88 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG4
#define LDA	  ARG3
#define X	  %rdi
#define INCX	  %rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define xtemp3	%xmm2
#define xtemp4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9
#define yy1	%xmm10
#define yy2	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define xt1	%xmm15

#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE
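
	/* Save callee-saved registers */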
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	OLD_Y, Y
	movq	OLD_INCY, INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999
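
	/* Build a sign mask in %xmm2 that negates the low double of a pair, */
	/* then form ALPHA_R = (alpha_r, alpha_i) and ALPHA_I = (-alpha_i, alpha_r) */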
	pcmpeqb	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	psllq	$63,   %xmm2
	unpcklpd %xmm3, %xmm2

	unpcklpd ALPHA_I, ALPHA_R
	unpcklpd ALPHA_R, ALPHA_I
	xorpd	%xmm2, ALPHA_I

	movq	BUFFER, XX

	movq	M, %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3
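
/* Scale x by alpha and pack it into BUFFER: each element is stored */
/* as (re, im) followed by (-im, re); four elements per iteration */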
.L01:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	movapd	%xmm5, 4 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 6 * SIZE(XX)

	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3,  8 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 10 * SIZE(XX)

	movapd	%xmm5, 12 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 14 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$3, %rax
	jle	.L05
	ALIGN_3

.L03:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4

	addpd	%xmm4, %xmm3

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	addq	$4 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512, XX
	andq	$-512, XX
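
	/* y can be used in place when it is contiguous (INCY == 2 * SIZE); */
	/* otherwise gather it into the 512-byte-aligned area of the buffer */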
	cmpq	$2 * SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	movhpd	1 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	movhpd	1 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	movhpd	1 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$2, N
	jl	.L20
	ALIGN_3
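
/* Outer loop: two columns of A per iteration. The 2x2 diagonal block */
/* is folded into the running sums xsum1/xsum2 before the row loop */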
.L11:
	movq	A,  A1
	leaq	(A, LDA, 1), A2
	leaq	4 * SIZE(A, LDA, 2), A

	leaq	(, IS, SIZE), I

	leaq	0 * SIZE(NEW_X, I, 4), XX
	leaq	4 * SIZE(NEW_Y, I, 2), YY

	movapd	0 * SIZE(XX), atemp1
	movapd	2 * SIZE(XX), atemp2
	movapd	4 * SIZE(XX), atemp3
	movapd	6 * SIZE(XX), atemp4

	MOVDDUP(0 * SIZE, A1, xsum1)
	MOVDDUP(2 * SIZE, A1, xsum2)

	mulpd	atemp1, xsum1
	mulpd	atemp1, xsum2

	MOVDDUP(1 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A1, a2)

	mulpd	atemp2, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(2 * SIZE, A1, a1)
	MOVDDUP(2 * SIZE, A2, a2)

	mulpd	atemp3, a1
	mulpd	atemp3, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(3 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A2, a2)

	mulpd	atemp4, a1
	mulpd	atemp4, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(4 * SIZE, A1, a1)
	MOVDDUP(6 * SIZE, A2, a2)

	movsd	0 * SIZE(YY), yy1
	movhpd	1 * SIZE(YY), yy1
	movsd	2 * SIZE(YY), yy2
	movhpd	3 * SIZE(YY), yy2

	movapd	 8 * SIZE(XX), xtemp1
	movapd	10 * SIZE(XX), xtemp2
	movapd	12 * SIZE(XX), xtemp3
	movapd	14 * SIZE(XX), xtemp4

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M, I
	subq	IS, I
	subq	$2, I
	sarq	$2, I
	jle	.L15
	ALIGN_3
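
/* Inner loop over the remaining rows, four per pass: each A element */
/* both updates y (A * x) and feeds the symmetric transpose part into */
/* xsum1/xsum2 via the packed (-im, re) copies in xtemp1..4 */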
.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	PREFETCH PREFETCHSIZE(A1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	PREFETCH PREFETCHSIZE(XX)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(6 * SIZE, A2, a2)

	PREFETCH PREFETCHSIZE(A2)

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(5 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(7 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(6 * SIZE, A1, a1)

	PREFETCHW PREFETCHSIZE(YY)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(4 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	20 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(7 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	16 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(5 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	22 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP( 8 * SIZE, A1, a1)

	movlpd	yy2, 6 * SIZE(YY)
	movhpd	yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	18 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(10 * SIZE, A2, a2)

	movlpd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	subq	$-16 * SIZE, XX
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3
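
/* Row-loop remainder: handle two leftover rows, then a single one */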
.L15:
	movq	M, I
	subq	IS, I
	subq	$2, I
	testq	$2, I
	jle	.L16

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L16:
	testq	$1, M
	jle	.L18

	MOVDDUP(1 * SIZE, A1, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1

	MOVDDUP(0 * SIZE, A2, a1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum1
	addpd	a2, yy1

	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum2
	addpd	a1, yy1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	ALIGN_3
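
/* Fold xsum1/xsum2 into y[is] and y[is + 1], then advance to the next column pair */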
.L18:
	leaq	(, IS, SIZE), I

	movsd	0 * SIZE(NEW_Y, I, 2), yy1
	movhpd	1 * SIZE(NEW_Y, I, 2), yy1
	movsd	2 * SIZE(NEW_Y, I, 2), yy2
	movhpd	3 * SIZE(NEW_Y, I, 2), yy2

	addpd	xsum1, yy1
	addpd	xsum2, yy2

	movlpd	yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd	yy1, 1 * SIZE(NEW_Y, I, 2)
	movlpd	yy2, 2 * SIZE(NEW_Y, I, 2)
	movhpd	yy2, 3 * SIZE(NEW_Y, I, 2)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3
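
/* If N is odd, process the last single column */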
.L20:
	testq	$1, N
	jle	.L990

	leaq	(, IS, SIZE), I

	movapd	0 * SIZE(NEW_X, I, 4), atemp1
	movapd	2 * SIZE(NEW_X, I, 4), atemp2

	movsd	0 * SIZE(NEW_Y, I, 2), yy1
	movhpd	1 * SIZE(NEW_Y, I, 2), yy1

	MOVDDUP(0 * SIZE, A, a1)
	MOVDDUP(1 * SIZE, A, a2)

	mulpd	atemp1, a1
	mulpd	atemp2, a2
	addpd	a1, yy1
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd	yy1, 1 * SIZE(NEW_Y, I, 2)
	ALIGN_3
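
/* If y was gathered into the buffer, scatter the result back with stride INCY */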
.L990:
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	movhpd	%xmm1, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	movhpd	%xmm2, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	movhpd	%xmm3, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movapd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$2 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3
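
/* Restore callee-saved registers and return */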
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE