OpenBLAS/kernel/x86_64/cgemm_kernel_8x2_haswell.S

/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
* 2014/07/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* CGEMM_DEFAULT_UNROLL_N 2
* CGEMM_DEFAULT_UNROLL_M 8
* CGEMM_DEFAULT_P 384
* CGEMM_DEFAULT_Q 192
* A_PR1 512
* B_PR1 512
*
* 2014/07/29 Saar
* Performance at 6912x6912x6912:
* 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86)
* 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155)
* 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222)
* 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279)
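*
* (For reference: complex single-precision GEMM performs roughly 8*M*N*K
*  floating point operations, so 6912x6912x6912 is about 8 * 6912^3 ~ 2.6 Tflop;
*  at 107 GFLOPS that is on the order of 25 seconds for the single-threaded run.)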
*
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %rbp
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
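/* Scalar locals (Ndiv6 ... KKK) occupy the first 128 bytes of the realigned
   stack frame; BUFFER1 (L_BUFFER_SIZE bytes) follows and holds panels of B
   repacked into the 3-, 2- or 1-column-wide layouts consumed by the kernels. */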
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $ 0, 4096 * 4(%rsp);\
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $ 0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#else
#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#endif
#else
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#else
#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#endif
#endif
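/* VFMADDPS_R accumulates products with the broadcast real part of B,
   VFMADDPS_I with the broadcast imaginary part.  The fmadd/fnmadd choice per
   transpose/conjugate case, together with the two vaddsubps orderings in the
   SAVE macros, implements the complex product
       (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
   and its conjugated variants. */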
#define A_PR1 512
#define B_PR1 512
/***************************************************************************************************************************/
.macro KERNEL8x3_SUB
vmovups -16 * SIZE(AO), %ymm0
vmovups -8 * SIZE(AO), %ymm1
vbroadcastss -8 * SIZE(BO), %ymm2
vbroadcastss -7 * SIZE(BO), %ymm3
prefetcht0 A_PR1(AO)
VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPS_R( %ymm12,%ymm2,%ymm1 )
VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
VFMADDPS_I( %ymm13,%ymm3,%ymm1 )
vbroadcastss -6 * SIZE(BO), %ymm2
vbroadcastss -5 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm10,%ymm2,%ymm0 )
VFMADDPS_R( %ymm14,%ymm2,%ymm1 )
VFMADDPS_I( %ymm11,%ymm3,%ymm0 )
VFMADDPS_I( %ymm15,%ymm3,%ymm1 )
vbroadcastss -4 * SIZE(BO), %ymm2
vbroadcastss -3 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
.endm
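/* One KERNEL8x3_SUB step consumes 8 complex elements of A (ymm0/ymm1) and
 * 3 complex elements of B, accumulating br[j]*a into ymm8/ymm12, ymm10/ymm14,
 * ymm4/ymm6 and bi[j]*a into ymm9/ymm13, ymm11/ymm15, ymm5/ymm7.  A rough
 * scalar sketch of the equivalent update for the plain (NN) case, with
 * hypothetical names:
 *
 *     for (i = 0; i < 8; i++)            // 8 rows of the A panel
 *         for (j = 0; j < 3; j++) {      // 3 columns of the B panel
 *             cr[i][j] += ar[i]*br[j] - ai[i]*bi[j];
 *             ci[i][j] += ar[i]*bi[j] + ai[i]*br[j];
 *         }
 *
 * The subtract/add combination itself is deferred to SAVE8x3; the kernel only
 * accumulates the raw products. */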
.macro SAVE8x3
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vaddsubps %ymm5, %ymm4 , %ymm4
vaddsubps %ymm7, %ymm6 , %ymm6
vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm10, %ymm11,%ymm11
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm14, %ymm15,%ymm15
vaddsubps %ymm4, %ymm5 ,%ymm5
vaddsubps %ymm6, %ymm7 ,%ymm7
vmovaps %ymm9, %ymm8
vmovaps %ymm11, %ymm10
vmovaps %ymm13, %ymm12
vmovaps %ymm15, %ymm14
vmovaps %ymm5, %ymm4
vmovaps %ymm7, %ymm6
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm10, %ymm0, %ymm10
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm14, %ymm0, %ymm14
vmulps %ymm4 , %ymm0, %ymm4
vmulps %ymm6 , %ymm0, %ymm6
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm11, %ymm1, %ymm11
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm15, %ymm1, %ymm15
vmulps %ymm5 , %ymm1, %ymm5
vmulps %ymm7 , %ymm1, %ymm7
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vaddsubps %ymm5, %ymm4 , %ymm4
vaddsubps %ymm7, %ymm6 , %ymm6
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
vaddps (CO1, LDC), %ymm10, %ymm10
vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
vaddps (CO1, LDC,2), %ymm4, %ymm4
vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 8 * SIZE(CO1, LDC)
vmovups %ymm4 , (CO1, LDC,2)
vmovups %ymm6 , 8 * SIZE(CO1, LDC,2)
.endm
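/* SAVE8x3 reconstructs the complex result: the imaginary-part accumulators are
   shuffled so real/imaginary lanes line up, vaddsubps folds them into the
   accumulated product, the same shuffle/vaddsubps trick applies
   alpha = ALPHA_R + i*ALPHA_I, and (unless TRMMKERNEL is defined) the existing
   contents of C are added; beta scaling is expected to have been applied
   outside this kernel. */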
/***************************************************************************************************************************/
.macro KERNEL4x3_SUB
vmovups -16 * SIZE(AO), %ymm0
vbroadcastss -8 * SIZE(BO), %ymm2
vbroadcastss -7 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
vbroadcastss -6 * SIZE(BO), %ymm2
vbroadcastss -5 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm12,%ymm2,%ymm0 )
VFMADDPS_I( %ymm13,%ymm3,%ymm0 )
vbroadcastss -4 * SIZE(BO), %ymm2
vbroadcastss -3 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
.macro SAVE4x3
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm5, %ymm4 , %ymm4
vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm4, %ymm5 ,%ymm5
vmovaps %ymm9, %ymm8
vmovaps %ymm13, %ymm12
vmovaps %ymm5, %ymm4
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm4 , %ymm0, %ymm4
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm5 , %ymm1, %ymm5
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm5, %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps (CO1, LDC), %ymm12, %ymm12
vaddps (CO1, LDC,2), %ymm4, %ymm4
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , (CO1, LDC)
vmovups %ymm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL2x3_SUB
vmovups -16 * SIZE(AO), %xmm0
vbroadcastss -8 * SIZE(BO), %xmm2
vbroadcastss -7 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
vbroadcastss -6 * SIZE(BO), %xmm2
vbroadcastss -5 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
vbroadcastss -4 * SIZE(BO), %xmm2
vbroadcastss -3 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
.macro SAVE2x3
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm4, %xmm5 ,%xmm5
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
vmovaps %xmm5, %xmm4
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm5 , %xmm1, %xmm5
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm12, %xmm12
vaddps (CO1, LDC,2), %xmm4, %xmm4
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , (CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL1x3_SUB
vmovsd -16 * SIZE(AO), %xmm0
vbroadcastss -8 * SIZE(BO), %xmm2
vbroadcastss -7 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
vbroadcastss -6 * SIZE(BO), %xmm2
vbroadcastss -5 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
vbroadcastss -4 * SIZE(BO), %xmm2
vbroadcastss -3 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
.macro SAVE1x3
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm4, %xmm5 ,%xmm5
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
vmovaps %xmm5, %xmm4
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm5 , %xmm1, %xmm5
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vmovsd (CO1) , %xmm9
vmovsd (CO1,LDC) , %xmm13
vmovsd (CO1,LDC,2), %xmm5
vaddps %xmm9 , %xmm8 , %xmm8
vaddps %xmm13, %xmm12, %xmm12
vaddps %xmm5 , %xmm4, %xmm4
#endif
vmovsd %xmm8 , (CO1)
vmovsd %xmm12 , (CO1, LDC)
vmovsd %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL8x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_R( %ymm10,%ymm6,%ymm0 )
VFMADDPS_R( %ymm14,%ymm6,%ymm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_I( %ymm11,%ymm7,%ymm0 )
VFMADDPS_I( %ymm15,%ymm7,%ymm1 )
addq $ 4 , BI
addq $ 16, %rax
.endm
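/* Unlike the 3-column kernels above, the 2- and 1-column kernels address A and
   B as (AO,%rax,SIZE) / (BO,BI,SIZE): the loop setup advances AO/BO past the
   tile and negates %rax/BI, so both counters run from a negative value up to
   zero and the surrounding je/jl branches test the flags set by the final addq
   inside each macro. */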
.macro SAVE8x2
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm10, %ymm11,%ymm11
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm14, %ymm15,%ymm15
vmovaps %ymm9, %ymm8
vmovaps %ymm11, %ymm10
vmovaps %ymm13, %ymm12
vmovaps %ymm15, %ymm14
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm10, %ymm0, %ymm10
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm14, %ymm0, %ymm14
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm11, %ymm1, %ymm11
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm15, %ymm1, %ymm15
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
vaddps (CO1, LDC), %ymm10, %ymm10
vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 8 * SIZE(CO1, LDC)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
/***************************************************************************************************************************/
.macro KERNEL4x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
VFMADDPS_R( %xmm14,%xmm6,%xmm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
VFMADDPS_I( %xmm15,%xmm7,%xmm1 )
addq $ 4, BI
addq $ 8, %rax
.endm
.macro SAVE4x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm14, %xmm14, %xmm15
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm14, %xmm15,%xmm15
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
vmovaps %xmm13, %xmm12
vmovaps %xmm15, %xmm14
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm14, %xmm0, %xmm14
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm15, %xmm1, %xmm15
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
vaddps (CO1, LDC), %xmm10, %xmm10
vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 4 * SIZE(CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL2x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 4, %rax
.endm
.macro SAVE2x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL1x2_SUB
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 2, %rax
.endm
.macro SAVE1x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#if !defined(TRMMKERNEL)
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
vmovsd (CO1, LDC), %xmm15
vaddps %xmm15, %xmm10, %xmm10
#endif
vmovsd %xmm8 , (CO1)
vmovsd %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL8x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
addq $ 2 , BI
addq $ 16, %rax
.endm
.macro SAVE8x1
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm12, %ymm13,%ymm13
vmovaps %ymm9, %ymm8
vmovaps %ymm13, %ymm12
// swap real and imaginary parts
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm12, %ymm0, %ymm12
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm13, %ymm1, %ymm13
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL4x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
addq $ 2, BI
addq $ 8, %rax
.endm
.macro SAVE4x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL2x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 4, %rax
.endm
.macro SAVE2x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
.endm
/************************************************************************************************/
.macro KERNEL1x1_SUB
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 2, %rax
.endm
.macro SAVE1x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#if !defined(TRMMKERNEL)
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
#endif
vmovsd %xmm8 , (CO1)
.endm
#if !defined(TRMMKERNEL)
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA_R
vmovss %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 6, %rdi
divq %rdi // N / 6
movq %rax, Ndiv6 // N / 6
movq %rdx, Nmod6 // N % 6
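/* Column blocking: Ndiv6 = N/6 outer iterations, each handled as two 3-column
   passes (.L6_xx and .L7_xx); the remaining Nmod6 columns are processed in
   pairs (.L2_xx) and, if N is odd, as one final single column (.L1_xx). */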
/************************************************************************************************/
.L6_0:
movq Ndiv6, J
cmpq $ 0, J
je .L2_00
ALIGN_4
.L6_01:
// copy to sub buffer
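// per k: two complex values from the current 2-wide packed panel of B plus one
// complex value from the following panel are interleaved into a 6-float
// (3-complex) row of BUFFER1; .L7_02b below packs the remaining halves the
// same way for the second 3-column pass.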
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,4), BO2
movq BO2, B // next offset of B
movq K, %rax
ALIGN_4
.L6_02b:
vmovups (BO1), %xmm0
vmovsd (BO2), %xmm1
vmovups %xmm0, (BO)
vmovsd %xmm1, 4*SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L6_02b
.L6_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L6_4_10
ALIGN_4
/**********************************************************************************************************/
.L6_8_11:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_8_16
ALIGN_4
.L6_8_12:
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
je .L6_8_16
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
je .L6_8_16
jmp .L6_8_12
ALIGN_4
.L6_8_16:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L6_8_19
ALIGN_4
.L6_8_17:
KERNEL8x3_SUB
jnz .L6_8_17
ALIGN_4
.L6_8_19:
SAVE8x3
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L6_8_11
ALIGN_4
/**********************************************************************************************************/
.L6_4_10:
testq $ 7, M
jz .L6_4_60 // to next 3 lines of N
testq $ 4, M
jz .L6_4_20
ALIGN_4
.L6_4_11:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_4_16
ALIGN_4
.L6_4_12:
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
jmp .L6_4_12
ALIGN_4
.L6_4_16:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L6_4_19
ALIGN_4
.L6_4_17:
KERNEL4x3_SUB
jnz .L6_4_17
ALIGN_4
.L6_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L6_4_20:
testq $ 2, M
jz .L6_4_40
ALIGN_4
.L6_4_21:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_4_26
ALIGN_4
.L6_4_22:
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_4_26
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_4_26
jmp .L6_4_22
ALIGN_4
.L6_4_26:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L6_4_29
ALIGN_4
.L6_4_27:
KERNEL2x3_SUB
jnz .L6_4_27
ALIGN_4
.L6_4_29:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L6_4_21
ALIGN_4
/**************************************************************************/
.L6_4_40:
testq $ 1, M
jz .L6_4_60 // to next 3 lines of N
ALIGN_4
.L6_4_41:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_4_46
ALIGN_4
.L6_4_42:
prefetcht0 A_PR1(AO)
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_4_46
prefetcht0 A_PR1(AO)
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_4_46
jmp .L6_4_42
ALIGN_4
.L6_4_46:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L6_4_49
ALIGN_4
.L6_4_47:
KERNEL1x3_SUB
jnz .L6_4_47
ALIGN_4
.L6_4_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L6_4_41
ALIGN_4
.L6_4_60:
/*******************************************************************************************/
.L7_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,4), BO2
movq K, %rax
ALIGN_4
.L7_02b:
vmovsd 2*SIZE(BO1), %xmm0
vmovups (BO2), %xmm1
vmovsd %xmm0, (BO)
vmovups %xmm1, 2*SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L7_02b
movq BO2, B // next offset of B
.L7_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L7_4_10
ALIGN_4
/**********************************************************************************************************/
.L7_8_11:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_8_16
ALIGN_4
.L7_8_12:
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
je .L7_8_16
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
KERNEL8x3_SUB
je .L7_8_16
jmp .L7_8_12
ALIGN_4
.L7_8_16:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L7_8_19
ALIGN_4
.L7_8_17:
KERNEL8x3_SUB
jnz .L7_8_17
ALIGN_4
.L7_8_19:
SAVE8x3
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L7_8_11
ALIGN_4
/**********************************************************************************************************/
.L7_4_10:
testq $ 7, M
jz .L7_4_60 // to next 6 lines of N
testq $ 4, M
jz .L7_4_20
ALIGN_4
.L7_4_11:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_4_16
ALIGN_4
.L7_4_12:
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
prefetcht0 A_PR1(AO)
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
jmp .L7_4_12
ALIGN_4
.L7_4_16:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L7_4_19
ALIGN_4
.L7_4_17:
KERNEL4x3_SUB
jnz .L7_4_17
ALIGN_4
.L7_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L7_4_20:
testq $ 2, M
jz .L7_4_40
ALIGN_4
.L7_4_21:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_4_26
ALIGN_4
.L7_4_22:
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_4_26
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
prefetcht0 A_PR1(AO)
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_4_26
jmp .L7_4_22
ALIGN_4
.L7_4_26:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L7_4_29
ALIGN_4
.L7_4_27:
KERNEL2x3_SUB
jnz .L7_4_27
ALIGN_4
.L7_4_29:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L7_4_21
ALIGN_4
/**************************************************************************/
.L7_4_40:
testq $ 1, M
jz .L7_4_60 // to next 6 lines of N
ALIGN_4
.L7_4_41:
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_4_46
ALIGN_4
.L7_4_42:
prefetcht0 A_PR1(AO)
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_4_46
prefetcht0 A_PR1(AO)
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_4_46
jmp .L7_4_42
ALIGN_4
.L7_4_46:
movq K, %rax
andq	$ 7, %rax		# if (k & 7)
je .L7_4_49
ALIGN_4
.L7_4_47:
KERNEL1x3_SUB
jnz .L7_4_47
ALIGN_4
.L7_4_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L7_4_41
ALIGN_4
.L7_4_60:
decq J // j --
jg .L6_01 // next 6 lines of N
/************************************************************************************************/
.L2_00:
movq Nmod6, J
sarq $1, J // j = j / 2
cmpq $ 0, J
je .L1_0
ALIGN_4
.L2_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_02b
.L2_02c:
movq BO1, B // next offset of B
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L2_4_10
ALIGN_4
/**********************************************************************************************************/
.L2_8_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 8, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_8_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
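// AO/BO now point just past the data for this tile; %rax and BI hold the
// negated element counts, so the kernels below index backwards from the end
// and the unrolled loop exits when the counters reach zero.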
ALIGN_4
.L2_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
jmp .L2_8_12
ALIGN_4
.L2_8_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L2_8_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_8_17:
KERNEL8x2_SUB
jl .L2_8_17
ALIGN_4
.L2_8_19:
SAVE8x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 8, KK
#endif
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L2_8_11
ALIGN_4
/**********************************************************************************************************/
.L2_4_10:
testq $ 7, M
jz .L2_4_60 // to next 2 lines of N
testq $ 4, M
jz .L2_4_20
ALIGN_4
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_4_20:
testq $ 2, M
jz .L2_4_40
ALIGN_4
.L2_4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_26
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_4_26
jmp .L2_4_22
ALIGN_4
.L2_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L2_4_29
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_27:
KERNEL2x2_SUB
jl .L2_4_27
ALIGN_4
.L2_4_29:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L2_4_21
ALIGN_4
/**************************************************************************/
.L2_4_40:
testq $ 1, M
jz .L2_4_60 // to next 2 lines of N
ALIGN_4
.L2_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
jmp .L2_4_42
ALIGN_4
.L2_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L2_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_47:
KERNEL1x2_SUB
jl .L2_4_47
ALIGN_4
.L2_4_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_4_41
ALIGN_4
.L2_4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_02b:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L1_4_10
ALIGN_4
/**************************************************************************************************/
.L1_8_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 8, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_8_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
jmp .L1_8_12
ALIGN_4
.L1_8_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L1_8_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_17:
KERNEL8x1_SUB
jl .L1_8_17
ALIGN_4
.L1_8_19:
SAVE8x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 8, KK
#endif
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L1_8_11
ALIGN_4
/**************************************************************************************************/
.L1_4_10:
testq $ 7, M
jz .L999
testq $ 4, M
jz .L1_4_20
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq	$ 7, %rax		# if (k & 7)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_4_20:
testq $ 2, M
jz .L1_4_40
ALIGN_4
.L1_4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_26
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
jmp .L1_4_22
ALIGN_4
.L1_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_29
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_27:
KERNEL2x1_SUB
jl .L1_4_27
ALIGN_4
.L1_4_29:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************/
.L1_4_40:
testq $ 1, M
jz .L999 // done, no remaining rows
ALIGN_4
.L1_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
jmp .L1_4_42
ALIGN_4
.L1_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_47:
KERNEL1x1_SUB
jl .L1_4_47
ALIGN_4
.L1_4_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#else
/************************************************************************************************/
PROLOGUE
PROFCODE
subq $ STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
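/* Reserve 128 bytes of scratch plus the packing buffer below the saved
   registers, then align %rsp down to a 4 KiB boundary. STACK_TOUCH, defined
   near the top of this file for Windows builds, writes to each newly
   reserved page so the OS commits the stack region before BUFFER1 is used. */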
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA_R
vmovss %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 2, %rdi
divq %rdi // N / 2
movq %rax, Ndiv6 // N / 2
movq %rdx, Nmod6 // N % 2
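// Note: despite the names (presumably inherited from kernels that block N by 6),
// Ndiv6 holds N / 2 and Nmod6 holds N % 2 here; this kernel processes N two
// columns at a time, with a single-column cleanup pass below (.L1_0).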
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
.L2_0:
movq Ndiv6, J
cmpq $ 0, J
je .L1_0
ALIGN_4
.L2_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_02b
.L2_02c:
movq BO1, B // next offset of B
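/* The .L2_02b loop above packs the next two columns of B into the local
   buffer: each iteration copies one k-step, i.e. two complex singles
   (4 floats), assuming the usual packed-B layout of two complex values per
   k-step. A rough C sketch of the same copy (names are illustrative only):

       float *src = b, *dst = buffer1;
       for (BLASLONG k = 0; k < K; k++) {
           dst[0] = src[0];  dst[1] = src[1];   // column 0: re, im
           dst[2] = src[2];  dst[3] = src[3];   // column 1: re, im
           src += 4;  dst += 4;
       }
       b = src;          // B now points past the packed panel
*/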
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L2_4_10
ALIGN_4
/**********************************************************************************************************/
.L2_8_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 8, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_8_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
jmp .L2_8_12
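/* The .L2_8_12 body above is unrolled as two groups of eight KERNEL8x2_SUB
   calls with an exit test in between; %rax was masked to a multiple of 8, so
   checking only after each group of eight is enough to leave the loop. */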
ALIGN_4
.L2_8_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_8_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_8_17:
KERNEL8x2_SUB
jl .L2_8_17
ALIGN_4
.L2_8_19:
SAVE8x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 8, KK
#endif
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L2_8_11
ALIGN_4
/**********************************************************************************************************/
.L2_4_10:
testq $ 7, M
jz .L2_4_60 // to next 2 lines of N
testq $ 4, M
jz .L2_4_20
ALIGN_4
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_4_20:
testq $ 2, M
jz .L2_4_40
ALIGN_4
.L2_4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_26
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_4_26
jmp .L2_4_22
ALIGN_4
.L2_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_4_29
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_27:
KERNEL2x2_SUB
jl .L2_4_27
ALIGN_4
.L2_4_29:
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap real and imaginary parts (exchange adjacent 32-bit lanes of each complex value)
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap real and imaginary parts (exchange adjacent 32-bit lanes of each complex value)
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
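/* The two vmulps pairs and the vaddsubps above apply the complex alpha to the
   accumulated tile. With each element t = (t_r, t_i) packed in xmm8/xmm10 and
   its re/im-swapped copy (t_i, t_r) in xmm9/xmm11 (set up by the shuffles
   above), the per-element result is
       ( alpha_r*t_r - alpha_i*t_i , alpha_r*t_i + alpha_i*t_r )
   i.e. alpha * t, because vaddsubps subtracts in the even (real) lanes and
   adds in the odd (imaginary) lanes. The same step in rough scalar C:

       float c_re = alpha_r * t_re - alpha_i * t_im;
       float c_im = alpha_r * t_im + alpha_i * t_re;
*/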
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L2_4_21
ALIGN_4
/**************************************************************************/
.L2_4_40:
testq $ 1, M
jz .L2_4_60 // to next 2 lines of N
ALIGN_4
.L2_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
jmp .L2_4_42
ALIGN_4
.L2_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L2_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_47:
KERNEL1x2_SUB
jl .L2_4_47
ALIGN_4
.L2_4_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_4_41
ALIGN_4
.L2_4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_02b:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
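/* Same packing as .L2_02b above, but for the single remaining column of B:
   each k-step of .L1_02b copies one complex single (2 floats) into the local
   buffer through vmovsd. */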
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L1_4_10
ALIGN_4
/**************************************************************************************************/
.L1_8_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 8, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_8_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
jmp .L1_8_12
ALIGN_4
.L1_8_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_8_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_17:
KERNEL8x1_SUB
jl .L1_8_17
ALIGN_4
.L1_8_19:
SAVE8x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 8, KK
#endif
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L1_8_11
ALIGN_4
/**************************************************************************************************/
.L1_4_10:
testq $ 7, M
jz .L999
testq $ 4, M
jz .L1_4_20
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_4_20:
testq $ 2, M
jz .L1_4_40
ALIGN_4
.L1_4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_26
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
jmp .L1_4_22
ALIGN_4
.L1_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_29
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_27:
KERNEL2x1_SUB
jl .L1_4_27
ALIGN_4
.L1_4_29:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************/
.L1_4_40:
testq $ 1, M
jz .L999 // done, no remaining rows
ALIGN_4
.L1_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
jmp .L1_4_42
ALIGN_4
.L1_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 7)
je .L1_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_47:
KERNEL1x1_SUB
jl .L1_4_47
ALIGN_4
.L1_4_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#endif