OpenBLAS/kernel/x86_64/cgemm_kernel_4x2_piledriver.S

/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*********************************************************************
*
* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2013/10/31 Saar
*
* Parameters:
* UNROLL_M 4
* UNROLL_N 2
* CGEMM_P 768
* CGEMM_Q 168
* A_PR1 512
* B_PR1 256
*
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
*
* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 )
* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 )
* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 )
* 3456x3456 37.3 GFLOPS with 1 thread on 1 module (ACML: 24.2 ) (BULLDOZER: 36.5 )
*
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
*
* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 )
* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 )
* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 )
* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 )
* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 )
* 3456x3456 31.8 GFLOPS with 1 thread on 1 module (ACML: 22.6 ) (BULLDOZER: 31.4 )
*
*********************************************************************/
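/*********************************************************************
* Note on the parameters above: UNROLL_M x UNROLL_N (4 x 2 complex
* elements) is the register tile computed by the KERNEL* macros below,
* while CGEMM_P and CGEMM_Q are the blocking sizes the generic OpenBLAS
* level-3 driver applies to the M and K dimensions before calling this
* micro-kernel on packed panels of A and B. A_PR1 and B_PR1 are the
* prefetch distances, in bytes, used by the prefetcht0 instructions
* inside the macros.
*********************************************************************/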
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $0, 4096 * 4(%rsp);\
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
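/*********************************************************************
* STACK_TOUCH: Windows commits stack memory one guard page at a time,
* so once %rsp has been dropped past more than one 4 KiB page, the
* skipped pages must be probed (touched from the highest address down)
* before they are used. The dummy stores above do exactly that for the
* on-stack buffer; on other systems no probing is needed.
*********************************************************************/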
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADD_R vfmaddps
#define VFMADD_I vfmaddps
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADD_R vfnmaddps
#define VFMADD_I vfmaddps
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADD_R vfmaddps
#define VFMADD_I vfnmaddps
#else
#define VFMADD_R vfnmaddps
#define VFMADD_I vfnmaddps
#endif
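/*********************************************************************
* The four VFMADD_R/VFMADD_I combinations above select the signs of the
* partial products for the plain and conjugated cases: one set of
* accumulators (xmm8/10/12/14) collects a * b_real products and the
* other (xmm9/11/13/15) collects a * b_imag products; the
* shuffle/addsubps sequence in the store code folds them into real and
* imaginary parts. A scalar C sketch of one update (illustrative only):
*
*   // plain (NN-type) case:
*   c_r += a_r * b_r - a_i * b_i;
*   c_i += a_i * b_r + a_r * b_i;
*   // conjugating A negates the b_r products (vfnmaddps as VFMADD_R),
*   // conjugating B negates the b_i products (vfnmaddps as VFMADD_I).
*********************************************************************/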
#define A_PR1 512
#define B_PR1 256
#define KERNEL4x2_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL4x2_2(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL4x2_3(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL4x2_4(xx) \
vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
addq $16, BI ;\
addq $32, %rax ;\

#define KERNEL4x2_SUB(xx) \
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
addq $4, BI ;\
addq $8, %rax ;\
/************************************************************************************************/
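/*********************************************************************
* The KERNEL4x2_* macros above perform one rank-1 update of a 4x2 tile
* of complex C per k-iteration: xmm0/xmm1 hold four complex values from
* the packed A panel, xmm4..xmm7 broadcast the real and imaginary parts
* of the two current B values, and xmm8..xmm15 are the accumulators
* (real-product / imag-product pairs). A scalar C sketch of the work
* done per k step, once the accumulators are folded at store time
* (names illustrative):
*
*   for (i = 0; i < 4; i++)        // four rows taken from A
*     for (j = 0; j < 2; j++) {    // two columns taken from B
*       c_r[i][j] += a_r[i]*b_r[j] - a_i[i]*b_i[j];
*       c_i[i][j] += a_i[i]*b_r[j] + a_r[i]*b_i[j];
*     }
*
* The 2x2, 1x2, 4x1, 2x1 and 1x1 variants that follow are the same
* pattern with fewer rows and/or a single column of B.
*********************************************************************/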
#define KERNEL2x2_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL2x2_2(xx) \
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL2x2_3(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL2x2_4(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $16, BI ;\
addq $16, %rax ;\

#define KERNEL2x2_SUB(xx) \
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $4, BI ;\
addq $4, %rax ;\
/************************************************************************************************/
#define KERNEL1x2_1(xx) \
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_2(xx) \
vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_3(xx) \
vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_4(xx) \
vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $16, BI ;\
addq $8, %rax ;\

#define KERNEL1x2_SUB(xx) \
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
addq $4, BI ;\
addq $2, %rax ;\
/************************************************************************************************/
#define KERNEL4x1_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL4x1_2(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL4x1_3(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL4x1_4(xx) \
vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
addq $8, BI ;\
addq $32, %rax ;\

#define KERNEL4x1_SUB(xx) \
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
addq $2, BI ;\
addq $8, %rax ;\
/************************************************************************************************/
#define KERNEL2x1_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL2x1_2(xx) \
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL2x1_3(xx) \
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL2x1_4(xx) \
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $8, BI ;\
addq $16, %rax ;\

#define KERNEL2x1_SUB(xx) \
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $2, BI ;\
addq $4, %rax ;\
/************************************************************************************************/
#define KERNEL1x1_1(xx) \
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_2(xx) \
vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_3(xx) \
vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_4(xx) \
vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $8, BI ;\
addq $8, %rax ;\

#define KERNEL1x1_SUB(xx) \
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
addq $2, BI ;\
addq $2, %rax ;\
/************************************************************************************************/
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $128 + L_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCH
cmpq $0, OLD_M
je .L999
cmpq $0, OLD_N
je .L999
cmpq $0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA_R
vmovss %xmm1, ALPHA_I
salq $ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $2, %rdi
divq %rdi // N / 2
movq %rax, Ndiv6 // N / 2
movq %rdx, Nmod6 // N % 2
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
.L2_0:
movq Ndiv6, J
cmpq $0, J
je .L1_0
ALIGN_4
.L2_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $4*SIZE,BO1
addq $4*SIZE,BO
decq %rax
jnz .L2_02b
.L2_02c:
movq BO1, B // next offset of B
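/*********************************************************************
* The .L2_02b loop above packs the two B columns used by this pass of
* the J loop into the on-stack BUFFER1: one k-row of the packed B panel
* (two complex values = four floats) is copied per iteration, roughly
* equivalent to
*
*   for (r = 0; r < K; r++)
*     memcpy(buffer1 + 4*r, b + 4*r, 4 * sizeof(float));
*
* and B is then advanced to the next two-column panel.
*********************************************************************/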
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = (m >> 2)
je .L2_20
ALIGN_4
.L2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_12:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx)
je .L2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx)
je .L2_16
jmp .L2_12
ALIGN_4
.L2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_17:
KERNEL4x2_SUB(xxx)
jl .L2_17
ALIGN_4
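/*********************************************************************
* Store phase for the 4x2 tile: the shuffle/addsubps pairs below first
* fold the separate b_real / b_imag accumulators into complex sums
* (see the note after the VFMADD_* selection above), and the result is
* then scaled by the complex alpha. Scalar sketch for one element, with
* t_r/t_i denoting the folded sums:
*
*   c_r = t_r * alpha_r - t_i * alpha_i;
*   c_i = t_i * alpha_r + t_r * alpha_i;
*
* followed by adding the previous contents of C unless this is built as
* the TRMM kernel. The smaller tiles use the same sequence.
*********************************************************************/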
.L2_19:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
vshufps $0xb1, %xmm13, %xmm13, %xmm13
vshufps $0xb1, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
vshufps $0xb1, %xmm10, %xmm10, %xmm11
vshufps $0xb1, %xmm12, %xmm12, %xmm13
vshufps $0xb1, %xmm14, %xmm14, %xmm15
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm14, %xmm15,%xmm15
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
vmovaps %xmm13, %xmm12
vmovaps %xmm15, %xmm14
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
vshufps $0xb1, %xmm13, %xmm13, %xmm13
vshufps $0xb1, %xmm15, %xmm15, %xmm15
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm14, %xmm0, %xmm14
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm15, %xmm1, %xmm15
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
vaddps (CO1, LDC), %xmm10, %xmm10
vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 4 * SIZE(CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
testq $3, M
jz .L2_60 // to next 2 lines of N
testq $2, M
jz .L2_40
ALIGN_4
.L2_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_26
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_22:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
je .L2_26
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx)
je .L2_26
jmp .L2_22
ALIGN_4
.L2_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L2_29
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_27:
KERNEL2x2_SUB(xxx)
jl .L2_27
ALIGN_4
.L2_29:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
vshufps $0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************/
.L2_40:
testq $1, M
jz .L2_60 // to next 2 lines of N
ALIGN_4
.L2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_42:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
je .L2_46
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx)
je .L2_46
jmp .L2_42
ALIGN_4
.L2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_47:
KERNEL1x2_SUB(xxx)
jl .L2_47
ALIGN_4
.L2_49:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
vshufps $0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
vmovsd (CO1, LDC), %xmm15
vaddps %xmm15, %xmm10, %xmm10
#endif
vmovsd %xmm8 , (CO1)
vmovsd %xmm10 , (CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_02b:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = (m >> 2)
je .L1_20
ALIGN_4
.L1_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_12:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx)
KERNEL4x1_4(xxx)
KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx)
KERNEL4x1_4(xxx)
je .L1_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx)
KERNEL4x1_4(xxx)
KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx)
KERNEL4x1_4(xxx)
je .L1_16
jmp .L1_12
ALIGN_4
.L1_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L1_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_17:
KERNEL4x1_SUB(xxx)
jl .L1_17
ALIGN_4
.L1_19:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
vshufps $0xb1, %xmm12, %xmm12, %xmm13
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
vshufps $0xb1, %xmm13, %xmm13, %xmm13
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L1_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_20:
testq $3, M
jz .L999
testq $2, M
jz .L1_40
ALIGN_4
.L1_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_26
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_22:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
je .L1_26
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx)
KERNEL2x1_4(xxx)
je .L1_26
jmp .L1_22
ALIGN_4
.L1_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L1_29
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_27:
KERNEL2x1_SUB(xxx)
jl .L1_27
ALIGN_4
.L1_29:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************/
.L1_40:
testq $1, M
jz .L999 // done
ALIGN_4
.L1_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_42:
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
je .L1_46
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx)
KERNEL1x1_4(xxx)
je .L1_46
jmp .L1_42
ALIGN_4
.L1_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # if (k & 7)
je .L1_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_47:
KERNEL1x1_SUB(xxx)
jl .L1_47
ALIGN_4
.L1_49:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap real and imaginary parts of each complex value
vshufps $0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#ifndef TRMMKERNEL
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
#endif
vmovsd %xmm8 , (CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE