/* OpenBLAS kernel/x86_64/zgemm_kernel_4x2_haswell.S */
/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/********************************************************************************
* 2014/07/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameters:
* ZGEMM_DEFAULT_UNROLL_N 2
* ZGEMM_DEFAULT_UNROLL_M 4
* ZGEMM_DEFAULT_P 256
* ZGEMM_DEFAULT_Q 128
* A_PR1 512
* B_PR1 512
*
* 2014/07/28 Saar
* Performance at 4608x4608x4608:
* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53)
* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100)
* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138)
* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172)
*
********************************************************************************/
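/********************************************************************************
* What the kernel computes (illustrative sketch, not part of the build):
* C += alpha * A * B for double-complex operands, with A and B already packed
* into panels by the level-3 driver; the kernel itself only adds into C, any
* beta scaling of C happens outside this file.  A minimal reference in C,
* assuming plain column-major operands and the hypothetical helper zgemm_ref:
*
*   #include <complex.h>
*
*   // C(m x n) += alpha * A(m x k) * B(k x n), all column-major
*   static void zgemm_ref(int m, int n, int k, double complex alpha,
*                         const double complex *A, int lda,
*                         const double complex *B, int ldb,
*                         double complex *C, int ldc)
*   {
*       for (int j = 0; j < n; j++)
*           for (int i = 0; i < m; i++) {
*               double complex s = 0.0;
*               for (int l = 0; l < k; l++)
*                   s += A[i + (size_t)l*lda] * B[l + (size_t)j*ldb];
*               C[i + (size_t)j*ldc] += alpha * s;
*           }
*   }
*
* The unroll parameters above describe the micro-tiles this file computes at a
* time; the main path below additionally regroups B so it can run 3 columns at
* once.
********************************************************************************/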
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $ 0, 4096 * 4(%rsp);\
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $ 0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
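/********************************************************************************
* STACK_TOUCH is a stack probe for Windows: stack pages below the committed
* region are guarded and must be touched one 4 KiB page at a time, from the
* old stack pointer down toward the new one, before the large local buffer can
* be used.  A rough C equivalent of the idea (sketch only, page size assumed
* to be 4096):
*
*   // touch one byte in every page of a freshly extended stack region,
*   // starting near the old top and working down toward the new %rsp
*   static void stack_touch(volatile char *new_sp, long extra_pages)
*   {
*       for (long p = extra_pages; p >= 1; p--)
*           new_sp[p * 4096] = 0;
*   }
********************************************************************************/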
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#else
#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#endif
#else
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#else
#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#endif
#endif
#define A_PR1 512
#define B_PR1 512
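/********************************************************************************
* VFMADDPD_R / VFMADDPD_I select between fused multiply-add and multiply-
* subtract depending on which operand (if any) is conjugated, encoded by the
* usual two-letter GEMM variant names (NN, CN, NR, CC, ...).  The kernels keep
* two accumulators per result vector: one collecting a * Re(b) products
* (VFMADDPD_R) and one collecting a * Im(b) products (VFMADDPD_I); conjugation
* only flips signs, which is exactly what the fma/fnma choice encodes.  A
* scalar sketch of the sign pattern (illustrative helper, not part of the
* build):
*
*   #include <complex.h>
*
*   static double complex cmul_variant(double complex a, double complex b,
*                                      int conj_a, int conj_b)
*   {
*       double ar = creal(a), ai = conj_a ? -cimag(a) : cimag(a);
*       double br = creal(b), bi = conj_b ? -cimag(b) : cimag(b);
*       return (ar*br - ai*bi) + (ar*bi + ai*br) * I;
*   }
********************************************************************************/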
/***************************************************************************************************/
.macro KERNEL4x3_SUB
vmovups (AO), %ymm0
vmovups 4 * SIZE(AO), %ymm1
prefetcht0 A_PR1(AO)
vbroadcastsd (BO), %ymm2
vbroadcastsd 1 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPD_R( %ymm12,%ymm2,%ymm1 )
VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm13,%ymm3,%ymm1 )
vbroadcastsd 2 * SIZE(BO), %ymm2
vbroadcastsd 3 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm10,%ymm2,%ymm0 )
VFMADDPD_R( %ymm14,%ymm2,%ymm1 )
VFMADDPD_I( %ymm11,%ymm3,%ymm0 )
VFMADDPD_I( %ymm15,%ymm3,%ymm1 )
vbroadcastsd 4 * SIZE(BO), %ymm2
vbroadcastsd 5 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 )
VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
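/********************************************************************************
* One KERNEL4x3_SUB invocation consumes 4 packed complex values of A and 3 of
* B and updates a 4x3 tile of partial products, split across the real/imag
* accumulator pairs ymm8/9, ymm12/13, ymm10/11, ymm14/15, ymm4/5, ymm6/7.
* In plain C the per-k update is, schematically (illustrative helper, with the
* accumulators shown already recombined into complex form):
*
*   #include <complex.h>
*
*   // acc[j][i] += A[i] * B[j] for a 4-row, 3-column micro-tile, one k step
*   static void kernel4x3_step(double complex acc[3][4],
*                              const double complex *ao,  // 4 packed A values
*                              const double complex *bo)  // 3 packed B values
*   {
*       for (int j = 0; j < 3; j++)
*           for (int i = 0; i < 4; i++)
*               acc[j][i] += ao[i] * bo[j];
*   }
*
* The trailing addq $ 6*SIZE / $ 8*SIZE correspond to bo += 3 and ao += 4
* complex values per step.
********************************************************************************/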
.macro SAVE4x3
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5
vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vaddsubpd %ymm5 ,%ymm4 , %ymm4
vaddsubpd %ymm7 ,%ymm6 , %ymm6
vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9
vshufpd $ 0x05, %ymm10, %ymm10, %ymm11
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
vshufpd $ 0x05, %ymm14, %ymm14, %ymm15
vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5
vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7
#else
vaddsubpd %ymm8, %ymm9 ,%ymm9
vaddsubpd %ymm10, %ymm11,%ymm11
vaddsubpd %ymm12, %ymm13,%ymm13
vaddsubpd %ymm14, %ymm15,%ymm15
vaddsubpd %ymm4 , %ymm5 ,%ymm5
vaddsubpd %ymm6 , %ymm7 ,%ymm7
vmovapd %ymm9, %ymm8
vmovapd %ymm11, %ymm10
vmovapd %ymm13, %ymm12
vmovapd %ymm15, %ymm14
vmovapd %ymm5 , %ymm4
vmovapd %ymm7 , %ymm6
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5
vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm10, %ymm0, %ymm10
vmulpd %ymm12, %ymm0, %ymm12
vmulpd %ymm14, %ymm0, %ymm14
vmulpd %ymm4 , %ymm0, %ymm4
vmulpd %ymm6 , %ymm0, %ymm6
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm11, %ymm1, %ymm11
vmulpd %ymm13, %ymm1, %ymm13
vmulpd %ymm15, %ymm1, %ymm15
vmulpd %ymm5 , %ymm1, %ymm5
vmulpd %ymm7 , %ymm1, %ymm7
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vaddsubpd %ymm5 ,%ymm4 , %ymm4
vaddsubpd %ymm7 ,%ymm6 , %ymm6
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
vaddpd (CO1, LDC), %ymm10, %ymm10
vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14
vaddpd (CO1, LDC,2), %ymm4 , %ymm4
vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 4 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 4 * SIZE(CO1, LDC)
vmovups %ymm4 , (CO1, LDC, 2)
vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
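/********************************************************************************
* The SAVE* macros recombine the two accumulators into complex products and
* scale them by alpha.  For the non-conjugated (NN-style) path the sequence
* is: swap the 64-bit halves of the Im(b) accumulator, addsub it into the
* Re(b) accumulator (subtract in even lanes, add in odd), then repeat the same
* shuffle/mul/addsub pattern against the broadcast alpha.  An intrinsics
* sketch for one ymm register holding two packed complex values (illustrative
* helper, not part of the build):
*
*   #include <immintrin.h>
*
*   // acc_r = sum of a*Re(b), acc_i = sum of a*Im(b), both laid out {re, im, re, im}
*   static __m256d combine_and_scale(__m256d acc_r, __m256d acc_i,
*                                    double alpha_r, double alpha_i)
*   {
*       __m256d swap_i = _mm256_shuffle_pd(acc_i, acc_i, 0x5); // {im, re} per lane
*       __m256d prod   = _mm256_addsub_pd(acc_r, swap_i);      // complex product
*       __m256d swap_p = _mm256_shuffle_pd(prod, prod, 0x5);
*       __m256d t_r    = _mm256_mul_pd(prod,   _mm256_set1_pd(alpha_r));
*       __m256d t_i    = _mm256_mul_pd(swap_p, _mm256_set1_pd(alpha_i));
*       return _mm256_addsub_pd(t_r, t_i);                     // prod * alpha
*   }
*
* Unless TRMMKERNEL is defined, the scaled tile is then added to the existing
* contents of C.
********************************************************************************/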
/***************************************************************************************************/
.macro KERNEL2x3_SUB
vmovups (AO), %xmm0
vmovups 2 * SIZE(AO), %xmm1
vmovddup (BO), %xmm2
vmovddup 1 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPD_R( %xmm12,%xmm2,%xmm1 )
VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm13,%xmm3,%xmm1 )
vmovddup 2 * SIZE(BO), %xmm2
vmovddup 3 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm10,%xmm2,%xmm0 )
VFMADDPD_R( %xmm14,%xmm2,%xmm1 )
VFMADDPD_I( %xmm11,%xmm3,%xmm0 )
VFMADDPD_I( %xmm15,%xmm3,%xmm1 )
vmovddup 4 * SIZE(BO), %xmm2
vmovddup 5 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
.macro SAVE2x3
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5
vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vaddsubpd %xmm5, %xmm4 , %xmm4
vaddsubpd %xmm7, %xmm6 , %xmm6
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
vshufpd $ 0x01, %xmm14, %xmm14, %xmm15
vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5
vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7
#else
vaddsubpd %xmm8, %xmm9 ,%xmm9
vaddsubpd %xmm10, %xmm11,%xmm11
vaddsubpd %xmm12, %xmm13,%xmm13
vaddsubpd %xmm14, %xmm15,%xmm15
vaddsubpd %xmm4, %xmm5 ,%xmm5
vaddsubpd %xmm6, %xmm7 ,%xmm7
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm13, %xmm12
vmovapd %xmm15, %xmm14
vmovapd %xmm5, %xmm4
vmovapd %xmm7, %xmm6
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm12, %xmm0, %xmm12
vmulpd %xmm14, %xmm0, %xmm14
vmulpd %xmm4 , %xmm0, %xmm4
vmulpd %xmm6 , %xmm0, %xmm6
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm13, %xmm1, %xmm13
vmulpd %xmm15, %xmm1, %xmm15
vmulpd %xmm5 , %xmm1, %xmm5
vmulpd %xmm7 , %xmm1, %xmm7
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vaddsubpd %xmm5, %xmm4 , %xmm4
vaddsubpd %xmm7, %xmm6 , %xmm6
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
vaddpd (CO1, LDC), %xmm10, %xmm10
vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
vaddpd (CO1, LDC,2), %xmm4 , %xmm4
vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 2 * SIZE(CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
vmovups %xmm6 , 2 * SIZE(CO1, LDC,2)
.endm
/************************************************************************************************/
.macro KERNEL1x3_SUB
vmovups (AO), %xmm0
vmovddup (BO), %xmm2
vmovddup 1 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm8,%xmm2,%xmm0 )
VFMADDPD_I( %xmm9,%xmm3,%xmm0 )
vmovddup 2 * SIZE(BO), %xmm2
vmovddup 3 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm10,%xmm2,%xmm0 )
VFMADDPD_I( %xmm11,%xmm3,%xmm0 )
vmovddup 4 * SIZE(BO), %xmm2
vmovddup 5 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
.macro SAVE1x3
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm5, %xmm4 , %xmm4
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vaddsubpd %xmm10,%xmm11, %xmm11
vaddsubpd %xmm4, %xmm5, %xmm5
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm5, %xmm4
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm5 , %xmm1, %xmm5
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm5, %xmm4 , %xmm4
#ifndef TRMMKERNEL
vaddpd (CO1) , %xmm8 , %xmm8
vaddpd (CO1, LDC) , %xmm10, %xmm10
vaddpd (CO1, LDC,2) , %xmm4 , %xmm4
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************/
.macro KERNEL4x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4
vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPD_R( %ymm10,%ymm6,%ymm0 )
VFMADDPD_R( %ymm14,%ymm6,%ymm1 )
VFMADDPD_I( %ymm11,%ymm7,%ymm0 )
VFMADDPD_I( %ymm15,%ymm7,%ymm1 )
addq $ 4, BI
addq $ 8, %rax
.endm
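/********************************************************************************
* From KERNEL4x2_SUB onwards the kernels use a negative-index addressing
* idiom: AO and BO are advanced to the end of the packed panels, %rax and BI
* hold negative element counts, and the addq at the end of each macro both
* advances the indices and sets the flags that the surrounding je/jl loops
* test.  A C sketch of the idiom on a plain dot product (illustrative only):
*
*   // bases are moved to the end of the arrays; the index runs -n .. -1
*   static double dot_neg_index(const double *a, const double *b, long n)
*   {
*       a += n;
*       b += n;
*       double s = 0.0;
*       for (long i = -n; i != 0; i++)
*           s += a[i] * b[i];
*       return s;
*   }
********************************************************************************/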
.macro SAVE4x2
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9
vshufpd $ 0x05, %ymm10, %ymm10, %ymm11
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
vshufpd $ 0x05, %ymm14, %ymm14, %ymm15
#else
vaddsubpd %ymm8, %ymm9 ,%ymm9
vaddsubpd %ymm10, %ymm11,%ymm11
vaddsubpd %ymm12, %ymm13,%ymm13
vaddsubpd %ymm14, %ymm15,%ymm15
vmovapd %ymm9, %ymm8
vmovapd %ymm11, %ymm10
vmovapd %ymm13, %ymm12
vmovapd %ymm15, %ymm14
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm10, %ymm0, %ymm10
vmulpd %ymm12, %ymm0, %ymm12
vmulpd %ymm14, %ymm0, %ymm14
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm11, %ymm1, %ymm11
vmulpd %ymm13, %ymm1, %ymm13
vmulpd %ymm15, %ymm1, %ymm15
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
vaddpd (CO1, LDC), %ymm10, %ymm10
vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 4 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 4 * SIZE(CO1, LDC)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
/***************************************************************************************************/
.macro KERNEL2x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
VFMADDPD_R( %xmm14,%xmm6,%xmm1 )
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
VFMADDPD_I( %xmm15,%xmm7,%xmm1 )
addq $ 4, BI
addq $ 4, %rax
.endm
.macro SAVE2x2
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
vshufpd $ 0x01, %xmm14, %xmm14, %xmm15
#else
vaddsubpd %xmm8, %xmm9 ,%xmm9
vaddsubpd %xmm10, %xmm11,%xmm11
vaddsubpd %xmm12, %xmm13,%xmm13
vaddsubpd %xmm14, %xmm15,%xmm15
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm13, %xmm12
vmovapd %xmm15, %xmm14
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm12, %xmm0, %xmm12
vmulpd %xmm14, %xmm0, %xmm14
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm13, %xmm1, %xmm13
vmulpd %xmm15, %xmm1, %xmm15
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
vaddpd (CO1, LDC), %xmm10, %xmm10
vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 2 * SIZE(CO1, LDC)
.endm
/************************************************************************************************/
/************************************************************************************************/
.macro KERNEL1x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 2, %rax
.endm
.macro SAVE1x2
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vaddsubpd %xmm10,%xmm11, %xmm11
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL4x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4
vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5
VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
addq $ 2, BI
addq $ 8, %rax
.endm
.macro SAVE4x1
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm13,%ymm12 , %ymm12
vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
#else
vaddsubpd %ymm8, %ymm9 , %ymm9
vaddsubpd %ymm12,%ymm13, %ymm13
vmovapd %ymm9, %ymm8
vmovapd %ymm13, %ymm12
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm12, %ymm0, %ymm12
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm13, %ymm1, %ymm13
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm13, %ymm12, %ymm12
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 ,4 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL2x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
addq $ 2, BI
addq $ 4, %rax
.endm
.macro SAVE2x1
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13,%xmm12 , %xmm12
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
#else
vaddsubpd %xmm8, %xmm9 , %xmm9
vaddsubpd %xmm12,%xmm13, %xmm13
vmovapd %xmm9, %xmm8
vmovapd %xmm13, %xmm12
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm12, %xmm0, %xmm12
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm13, %xmm1, %xmm13
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13, %xmm12, %xmm12
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL1x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 2, %rax
.endm
.macro SAVE1x1
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8, %xmm8
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vmovapd %xmm9, %xmm8
// swap high and low 64 bits (real and imaginary parts)
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vaddsubpd %xmm9 ,%xmm8, %xmm8
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
.endm
/************************************************************************************************/
#if !defined(TRMMKERNEL)
PROLOGUE
PROFCODE
subq $ STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA_R
vmovsd %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 6, %rdi
divq %rdi // N / 6
movq %rax, Ndiv6 // N / 6
movq %rdx, Nmod6 // N % 6
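/********************************************************************************
* Partitioning of N (sketch): columns are processed in blocks of 6, each block
* handled as two 3-column passes (.L6_* then .L7_*); the remainder is handled
* two columns at a time (.L2_*) and finally one column (.L1_*).  An
* illustrative helper computing the counts the code keeps in Ndiv6 / Nmod6:
*
*   static void split_n(long n, long *blocks6, long *blocks2, long *last1)
*   {
*       *blocks6 = n / 6;            // Ndiv6: 6-column blocks
*       *blocks2 = (n % 6) / 2;      // 2-column passes
*       *last1   = (n % 6) % 2;      // final single column, if any
*   }
********************************************************************************/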
/************************************************************************************************/
.L6_00_0:
movq Ndiv6, J
cmpq $ 0, J
je .L2_00_0
ALIGN_4
.L6_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,8), BO2
movq BO2, B // next offset of B
movq K, %rax
ALIGN_4
.L6_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups (BO2), %xmm2
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
vmovups %xmm2, 4 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L6_00_02b
.L6_00_02c:
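/********************************************************************************
* The copy loop above regroups B for the 3-column kernels: the packed B panel
* stores 2 columns per k step, so columns 0 and 1 are read from BO1 and column
* 2 from the following panel at BO2, giving 6 contiguous doubles of B per k in
* BUFFER1.  A C sketch of this pass (illustrative helper; the .L7_00_02b loop
* further down does the same for columns 3..5):
*
*   #include <complex.h>
*
*   // bo1: current 2-column panel, bo2: next 2-column panel (bo1 + 2*k values)
*   static void pack_b_3cols(double complex *buf, const double complex *bo1,
*                            const double complex *bo2, long k)
*   {
*       for (long l = 0; l < k; l++) {
*           buf[3*l + 0] = bo1[2*l + 0];   // column 0
*           buf[3*l + 1] = bo1[2*l + 1];   // column 1
*           buf[3*l + 2] = bo2[2*l + 0];   // column 2
*       }
*   }
********************************************************************************/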
.L6_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L6_2_10
ALIGN_4
/******************************************************************************************************************/
.L6_4_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_4_16
ALIGN_4
.L6_4_12:
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
jmp .L6_4_12
ALIGN_4
.L6_4_16:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L6_4_19
ALIGN_4
.L6_4_17:
KERNEL4x3_SUB
jnz .L6_4_17
ALIGN_4
.L6_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L6_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L6_2_10:
testq $ 2, M
jz .L6_2_40 // to next 2 lines of N
.L6_2_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_2_16
ALIGN_4
.L6_2_12:
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_2_16
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_2_16
jmp .L6_2_12
ALIGN_4
.L6_2_16:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L6_2_19
ALIGN_4
.L6_2_17:
KERNEL2x3_SUB
jnz .L6_2_17
ALIGN_4
.L6_2_19:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L6_2_40:
testq $ 1, M
jz .L6_2_60 // to next 2 lines of N
ALIGN_4
.L6_2_41:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_2_46
ALIGN_4
.L6_2_42:
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_2_46
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_2_46
jmp .L6_2_42
ALIGN_4
.L6_2_46:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L6_2_49
ALIGN_4
.L6_2_47:
KERNEL1x3_SUB
jnz .L6_2_47
ALIGN_4
.L6_2_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L6_2_41
ALIGN_4
.L6_2_60:
/************************************************************************************************/
/************************************************************************************************/
.L7_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,8), BO2
movq K, %rax
ALIGN_4
.L7_00_02b:
vmovups 2 * SIZE(BO1), %xmm0
vmovups (BO2), %xmm1
vmovups 2 * SIZE(BO2), %xmm2
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
vmovups %xmm2, 4 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L7_00_02b
.L7_00_02c:
movq BO2, B // next offset of B
.L7_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L7_2_10
ALIGN_4
/******************************************************************************************************************/
.L7_4_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_4_16
ALIGN_4
.L7_4_12:
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
jmp .L7_4_12
ALIGN_4
.L7_4_16:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L7_4_19
ALIGN_4
.L7_4_17:
KERNEL4x3_SUB
jnz .L7_4_17
ALIGN_4
.L7_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L7_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L7_2_10:
testq $ 2, M
jz .L7_2_40 // to next 2 lines of N
.L7_2_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_2_16
ALIGN_4
.L7_2_12:
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_2_16
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_2_16
jmp .L7_2_12
ALIGN_4
.L7_2_16:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L7_2_19
ALIGN_4
.L7_2_17:
KERNEL2x3_SUB
jnz .L7_2_17
ALIGN_4
.L7_2_19:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L7_2_40:
testq $ 1, M
jz .L7_2_60 // to next 2 lines of N
ALIGN_4
.L7_2_41:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_2_46
ALIGN_4
.L7_2_42:
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_2_46
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_2_46
jmp .L7_2_42
ALIGN_4
.L7_2_46:
movq K, %rax
andq $ 7, %rax # if (k & 7)
je .L7_2_49
ALIGN_4
.L7_2_47:
KERNEL1x3_SUB
jnz .L7_2_47
ALIGN_4
.L7_2_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L7_2_41
ALIGN_4
.L7_2_60:
decq J // j --
jg .L6_00_01 // next 6 lines of N
/************************************************************************************************/
/************************************************************************************************/
.L2_00_0:
movq Nmod6, J
sarq $1, J // j = j / 2
cmpq $ 0, J
je .L1_2_0
ALIGN_4
.L2_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_00_02b
.L2_00_02c:
movq BO1, B // next offset of B
.L2_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L2_2_10
ALIGN_4
/******************************************************************************************************************/
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L2_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L2_2_10:
testq $ 2, M
jz .L2_2_40 // to next 2 lines of N
.L2_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
jmp .L2_2_12
ALIGN_4
.L2_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_17:
KERNEL2x2_SUB
jl .L2_2_17
ALIGN_4
.L2_2_19:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_2_40:
testq $ 1, M
jz .L2_2_60 // to next 2 lines of N
ALIGN_4
.L2_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
jmp .L2_2_42
ALIGN_4
.L2_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_47:
KERNEL1x2_SUB
jl .L2_2_47
ALIGN_4
.L2_2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_2_41
ALIGN_4
.L2_2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_00_01 // next 2 lines of N
.L1_2_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_00_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_00_02b
.L1_00_02c:
movq BO1, B // next offset of B
.L1_00_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L1_2_10
ALIGN_4
/*******************************************************************************************************/
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L1_4_11
ALIGN_4
/*******************************************************************************************************/
.L1_2_10:
testq $ 2, M
jz .L1_2_40
.L1_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
jmp .L1_2_12
ALIGN_4
.L1_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_17:
KERNEL2x1_SUB
jl .L1_2_17
ALIGN_4
.L1_2_19:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_2_40:
testq $ 1, M
jz .L999
ALIGN_4
.L1_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
jmp .L1_2_42
ALIGN_4
.L1_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_47:
KERNEL1x1_SUB
jl .L1_2_47
ALIGN_4
.L1_2_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L1_2_41
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#else
/************************************************************************************************
TRMM Kernel
************************************************************************************************/
PROLOGUE
PROFCODE
subq $ STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA_R
vmovsd %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 2, %rdi
divq %rdi // N / 2
movq %rax, Ndiv6 // N / 2
movq %rdx, Nmod6 // N % 2
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
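/********************************************************************************
* TRMM bookkeeping (sketch): with one operand triangular, each micro-tile only
* runs over the k range that overlaps the nonzero triangle.  KK tracks the
* current offset and KKK holds the per-tile iteration count computed by the
* #if blocks below; schematically (mr/nr stand for the tile's unroll factors):
*
*   static long effective_k(long K, long KK, long mr, long nr,
*                           int left, int transa)
*   {
*       if ((left && !transa) || (!left && transa))
*           return K - KK;                 // "movq K,%rax; subq KK,%rax"
*       return KK + (left ? mr : nr);      // "movq KK,%rax; addq $mr/$nr,%rax"
*   }
*
* This mirrors the preprocessor cases used at each .L*_11 / .L*_41 entry point.
********************************************************************************/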
.L2_00_0:
movq Ndiv6, J
cmpq $ 0, J
je .L1_2_0
ALIGN_4
.L2_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_00_02b
.L2_00_02c:
movq BO1, B // next offset of B
.L2_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L2_2_10
ALIGN_4
/******************************************************************************************************************/
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L2_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L2_2_10:
testq $ 2, M
jz .L2_2_40 // to next 2 lines of N
.L2_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
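	// 2x2 main loop, unrolled 16x; A and B are prefetched every second iteration.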
.L2_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
jmp .L2_2_12
ALIGN_4
.L2_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
	andq	$ 7, %rax						# if (k & 7)
je .L2_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_17:
KERNEL2x2_SUB
jl .L2_2_17
ALIGN_4
.L2_2_19:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
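	// M remainder, single-row case: 1x2 micro-kernel, A advances 2 values per k.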
.L2_2_40:
testq $ 1, M
jz .L2_2_60 // to next 2 lines of N
ALIGN_4
.L2_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
jmp .L2_2_42
ALIGN_4
.L2_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
	andq	$ 7, %rax						# if (k & 7)
je .L2_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_47:
KERNEL1x2_SUB
jl .L2_2_47
ALIGN_4
.L2_2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_2_41
ALIGN_4
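	// Done with this pair of B columns: for right-side TRMM advance KK by the
	// N unroll of 2, then j-- and start the next pair.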
.L2_2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_00_01 // next 2 lines of N
.L1_2_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
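	// Pack the single remaining column of B into BUFFER1, one complex value
	// (2 * SIZE) per k.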
.L1_00_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_00_02b
.L1_00_02c:
movq BO1, B // next offset of B
.L1_00_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L1_2_10
ALIGN_4
/*******************************************************************************************************/
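	// N remainder, 4x1 blocks: four rows of A against the single packed column
	// of B; per k, A advances 8 values and B advances 2.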
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
	andq	$ 7, %rax						# if (k & 7)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L1_4_11
ALIGN_4
/*******************************************************************************************************/
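	// N remainder, 2x1 case: per k, A advances 4 values and B advances 2.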
.L1_2_10:
testq $ 2, M
jz .L1_2_40
.L1_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
jmp .L1_2_12
ALIGN_4
.L1_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
	andq	$ 7, %rax						# if (k & 7)
je .L1_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_17:
KERNEL2x1_SUB
jl .L1_2_17
ALIGN_4
.L1_2_19:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
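	// N remainder, 1x1 case: one complex multiply-accumulate per k.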
.L1_2_40:
testq $ 1, M
jz .L999
ALIGN_4
.L1_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
jmp .L1_2_42
ALIGN_4
.L1_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
	andq	$ 7, %rax						# if (k & 7)
je .L1_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_47:
KERNEL1x1_SUB
jl .L1_2_47
ALIGN_4
.L1_2_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L1_2_41
ALIGN_4
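	// Epilogue: restore the callee-saved registers (plus rdi/rsi and
	// xmm6-xmm15 on Windows), release the stack frame and return.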
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#endif