OpenBLAS/kernel/x86_64/zgemm_kernel_2x2_piledriver.S

/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*********************************************************************
*
* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2013/10/30 Saar
*
* Parameter:
* UNROLL_M 2
* UNROLL_N 2
* ZGEMM_P 384
* ZGEMM_Q 168
* A_PR1 512
* B_PR1 256
*
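* (UNROLL_M / UNROLL_N are the register-blocking sizes of this micro-kernel,
* i.e. one 2x2 block of complex results per inner iteration; ZGEMM_P / ZGEMM_Q
* are the cache-blocking sizes used by the level-3 driver; A_PR1 / B_PR1 are
* the software-prefetch distances, in bytes, used in the kernels below.)
*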
* Performance for the m x n sizes below on an AMD FX-8320 (compared against ACML 5.3.1):
*
* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 )
* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 )
* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 )
* 3456x3456 20.3 GFLOPS with 1 thread on 1 module (ACML: 18.1 ) (BULLDOZER: 19.2 )
*
* Performance for the m x n sizes below on an AMD Opteron 6380 (compared against ACML 5.3.1):
*
* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 )
* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 )
* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 )
* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 )
* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 )
* 3456x3456 17.0 GFLOPS with 1 thread on 1 module (ACML: 15.2 ) (BULLDOZER: 15.7 )
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $0, 4096 * 4(%rsp);\
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
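/* STACK_TOUCH writes one word into each 4 KiB page of the local buffer area
   so that, on Windows, the stack guard page is committed page by page before
   the buffer is used. On other systems (and for small buffers) it expands to
   nothing. */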
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADD_R vfmaddpd
#define VFMADD_I vfmaddpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADD_R vfnmaddpd
#define VFMADD_I vfmaddpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADD_R vfmaddpd
#define VFMADD_I vfnmaddpd
#else
#define VFMADD_R vfnmaddpd
#define VFMADD_I vfnmaddpd
#endif
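/* Complex multiply-accumulate: for a = ar + i*ai and b = br + i*bi,
 *   a*b = (ar*br - ai*bi) + i*(ai*br + ar*bi).
 * The VFMADD_R accumulators collect +/- a*Re(b) and the VFMADD_I accumulators
 * collect +/- a*Im(b); the negated FMA variants flip the sign for the
 * conjugated cases (first letter R/C: conjugate A, second letter R/C:
 * conjugate B), and the two halves are recombined with vshufpd/vaddsubpd in
 * the write-back code further down. */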
#define A_PR1 512
#define B_PR1 256
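/* Register layout of the 2x2 micro-kernel below:
 *   xmm0, xmm1    two packed complex elements of A (rows i and i+1)
 *   xmm4..xmm7    Re/Im of the two B elements (columns j and j+1),
 *                 broadcast with vmovddup
 *   xmm8 /xmm9    a*Re(b) / a*Im(b) partial sums for C(i  ,j  )
 *   xmm12/xmm13   a*Re(b) / a*Im(b) partial sums for C(i+1,j  )
 *   xmm10/xmm11   a*Re(b) / a*Im(b) partial sums for C(i  ,j+1)
 *   xmm14/xmm15   a*Re(b) / a*Im(b) partial sums for C(i+1,j+1)
 * A_PR1 and B_PR1 are prefetch distances in bytes ahead of the current
 * positions in A and B. */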
#define KERNEL2x2_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15
#define KERNEL2x2_2(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15
#define KERNEL2x2_3(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15
#define KERNEL2x2_4(xx) \
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
addq $16, BI ;\
addq $16, %rax
#define KERNEL2x2_SUB(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
addq $4, BI ;\
addq $4, %rax
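/*
 * For reference only (not assembled): a C sketch of what one KERNEL2x2_SUB
 * step accumulates in the non-conjugated (NN-type) case.  The names
 * kernel2x2_sub_ref, ao, bo, acc_r and acc_i are illustrative, not OpenBLAS
 * identifiers; ao/bo point at the packed A/B data for the current k
 * (re,im interleaved) and acc_r/acc_i mirror the xmm8..xmm15 pairs above.
 *
 *   static void kernel2x2_sub_ref(const double *ao, const double *bo,
 *                                 double acc_r[2][2][2], double acc_i[2][2][2])
 *   {
 *       for (int j = 0; j < 2; j++) {                 // columns j, j+1 of B
 *           double br = bo[2*j], bi = bo[2*j + 1];
 *           for (int i = 0; i < 2; i++) {             // rows i, i+1 of A
 *               acc_r[i][j][0] += ao[2*i]     * br;   // Re(a)*Re(b)
 *               acc_r[i][j][1] += ao[2*i + 1] * br;   // Im(a)*Re(b)
 *               acc_i[i][j][0] += ao[2*i]     * bi;   // Re(a)*Im(b)
 *               acc_i[i][j][1] += ao[2*i + 1] * bi;   // Im(a)*Im(b)
 *           }
 *       }
 *   }
 */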
/************************************************************************************************/
#define KERNEL1x2_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11
#define KERNEL1x2_2(xx) \
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11
#define KERNEL1x2_3(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11
#define KERNEL1x2_4(xx) \
vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $16, BI ;\
addq $8 , %rax
#define KERNEL1x2_SUB(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $4, BI ;\
addq $2, %rax
/************************************************************************************************/
#define KERNEL2x1_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13
#define KERNEL2x1_2(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13
#define KERNEL2x1_3(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13
#define KERNEL2x1_4(xx) \
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
addq $8, BI ;\
addq $16, %rax
#define KERNEL2x1_SUB(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
addq $2, BI ;\
addq $4, %rax
/************************************************************************************************/
#define KERNEL1x1_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9
#define KERNEL1x1_2(xx) \
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9
#define KERNEL1x1_3(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9
#define KERNEL1x1_4(xx) \
vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $8, BI ;\
addq $8, %rax
#define KERNEL1x1_SUB(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $2, BI ;\
addq $2, %rax
/************************************************************************************************/
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $128 + L_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCH
cmpq $0, OLD_M
je .L999
cmpq $0, OLD_N
je .L999
cmpq $0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA_R
vmovsd %xmm1, ALPHA_I
salq $ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $2, %rdi
divq %rdi // N / 2
movq %rax, Ndiv6 // N / 2
movq %rdx, Nmod6 // N % 2
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
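/*
 * Main loop: J = N/2 passes over pairs of B columns.  Each pass packs the two
 * current columns of B into the on-stack BUFFER1 (.L2_02b), then walks the
 * rows of A two at a time with the 2x2 kernel (.L2_11); an odd M is handled
 * by the 1x2 kernel (.L2_41) and an odd N by the 2x1/1x1 kernels (.L1_xx).
 */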
.L2_0:
movq Ndiv6, J
cmpq $0, J
je .L1_0
ALIGN_4
.L2_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
addq $4*SIZE,BO1
addq $4*SIZE,BO
decq %rax
jnz .L2_02b
.L2_02c:
movq BO1, B // next offset of B
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $8 * SIZE, AO
movq M, I
sarq $1, I // i = (m >> 1)
je .L2_40
ALIGN_4
.L2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_12:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
je .L2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
je .L2_16
jmp .L2_12
ALIGN_4
.L2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 (remainder iterations)
je .L2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_17:
KERNEL2x2_SUB(xxx)
jl .L2_17
ALIGN_4
.L2_19:
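/*
 * Write-back for the 2x2 block: swap the halves of the Im accumulators,
 * combine them with the Re accumulators via vaddsubpd to form the complex
 * products, scale by alpha = ALPHA_R + i*ALPHA_I using the same
 * shuffle/addsub trick, and (unless this is a TRMM kernel) add the result to C.
 */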
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm11, %xmm11, %xmm11
vshufpd $0x01, %xmm13, %xmm13, %xmm13
vshufpd $0x01, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vshufpd $0x01, %xmm8 , %xmm8, %xmm9
vshufpd $0x01, %xmm10, %xmm10, %xmm11
vshufpd $0x01, %xmm12, %xmm12, %xmm13
vshufpd $0x01, %xmm14, %xmm14, %xmm15
#else
vaddsubpd %xmm8, %xmm9 ,%xmm9
vaddsubpd %xmm10, %xmm11,%xmm11
vaddsubpd %xmm12, %xmm13,%xmm13
vaddsubpd %xmm14, %xmm15,%xmm15
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm13, %xmm12
vmovapd %xmm15, %xmm14
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm11, %xmm11, %xmm11
vshufpd $0x01, %xmm13, %xmm13, %xmm13
vshufpd $0x01, %xmm15, %xmm15, %xmm15
#endif
// multiply by ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm12, %xmm0, %xmm12
vmulpd %xmm14, %xmm0, %xmm14
// multiply by ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm13, %xmm1, %xmm13
vmulpd %xmm15, %xmm1, %xmm15
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
vaddpd (CO1, LDC), %xmm10, %xmm10
vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 2 * SIZE(CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_40:
testq $1, M
jz .L2_60 // to next 2 lines of N
ALIGN_4
.L2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_42:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
je .L2_46
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
prefetcht0 B_PR1+64(BO,BI,SIZE)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
je .L2_46
jmp .L2_42
ALIGN_4
.L2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 (remainder iterations)
je .L2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_47:
KERNEL1x2_SUB(xxx)
jl .L2_47
ALIGN_4
.L2_49:
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vshufpd $0x01, %xmm8 , %xmm8, %xmm9
vshufpd $0x01, %xmm10, %xmm10, %xmm11
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vaddsubpd %xmm10,%xmm11, %xmm11
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm11, %xmm11, %xmm11
#endif
// multiply by ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
// multiply by ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
.L1_0:
/************************************************************************************************
* Remaining single column of B when N is odd (Nmod6 = N % 2)
*************************************************************************************************/
movq Nmod6, J
andq $1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $8 * SIZE, AO
movq M, I
sarq $1, I // i = (m >> 1)
je .L1_40
ALIGN_4
.L1_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_12:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
je .L1_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
je .L1_16
jmp .L1_12
ALIGN_4
.L1_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 (remainder iterations)
je .L1_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_17:
KERNEL2x1_SUB(xxx)
jl .L1_17
ALIGN_4
.L1_19:
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13,%xmm12 , %xmm12
vshufpd $0x01, %xmm8 , %xmm8, %xmm9
vshufpd $0x01, %xmm12, %xmm12, %xmm13
#else
vaddsubpd %xmm8, %xmm9 , %xmm9
vaddsubpd %xmm12,%xmm13, %xmm13
vmovapd %xmm9, %xmm8
vmovapd %xmm13, %xmm12
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
vshufpd $0x01, %xmm13, %xmm13, %xmm13
#endif
// multiply by ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm12, %xmm0, %xmm12
// multiply by ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm13, %xmm1, %xmm13
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13, %xmm12, %xmm12
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L1_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_40:
testq $1, M
jz .L999
ALIGN_4
.L1_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_42:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
je .L1_46
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
je .L1_46
jmp .L1_42
ALIGN_4
.L1_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 (remainder iterations)
je .L1_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_47:
KERNEL1x1_SUB(xxx)
jl .L1_47
ALIGN_4
.L1_49:
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8, %xmm8
vshufpd $0x01, %xmm8 , %xmm8, %xmm9
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vmovapd %xmm9, %xmm8
// swap the high and low 64-bit halves (real <-> imaginary)
vshufpd $0x01, %xmm9 , %xmm9, %xmm9
#endif
// multiply by ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
// multiply by ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vaddsubpd %xmm9 ,%xmm8, %xmm8
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE