/* OpenBLAS/kernel/x86_64/sgemm_kernel_16x4_haswell.S
   x86-64 GAS (AT&T syntax) AVX2/FMA single-precision GEMM kernel. */
/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
* 2014/07/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* SGEMM_DEFAULT_UNROLL_N 4
* SGEMM_DEFAULT_UNROLL_M 16
* SGEMM_DEFAULT_P 768
* SGEMM_DEFAULT_Q 384
* A_PR1 512
* B_PR1 512
*
*
* 2014/07/28 Saar
* Performance at 9216x9216x9216:
* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83)
* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155)
* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230)
* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267)
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define BO2 %rbp
#define SP %rbx
#define BO1 %rdi
#define CO2 %rdx
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#if defined(OS_WINDOWS)
#define L_BUFFER_SIZE 8192
#else
#define L_BUFFER_SIZE 12288
#endif
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $0, 4096 * 4(%rsp);\
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
#if defined(BULLDOZER)
#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0
#else
#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0
#endif
#define A_PR1 512
#define B_PR1 512
/*******************************************************************************************
* 6 lines of N
*******************************************************************************************/
/* One k-step of the 16x6 micro-kernel: C(16x6) += A(16x1) * B(1x6).
   A column (16 floats) in ymm0/ymm1; each B value is broadcast into
   ymm2/ymm3 and FMA-accumulated into ymm4..ymm15 (one register pair
   per B column). Accumulators persist across invocations (zeroed by
   the caller's vzeroall). Advances AO by 16 and BO by 6 floats.
   The trailing decq sets ZF for the caller's je/jnz loop control. */
.macro KERNEL16x6_SUB
vmovups -16 * SIZE(AO), %ymm0                   // A[0..7]
vmovups -8 * SIZE(AO), %ymm1                    // A[8..15]
vbroadcastss -4 * SIZE(BO), %ymm2               // B col 0
vbroadcastss -3 * SIZE(BO), %ymm3               // B col 1
prefetcht0 A_PR1(AO)                            // prefetch A ahead
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
vbroadcastss -2 * SIZE(BO), %ymm2               // B col 2
vbroadcastss -1 * SIZE(BO), %ymm3               // B col 3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm9,%ymm2,%ymm1 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
VFMADD231PS_( %ymm11,%ymm3,%ymm1 )
vbroadcastss 0 * SIZE(BO), %ymm2                // B col 4
vbroadcastss 1 * SIZE(BO), %ymm3                // B col 5
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
VFMADD231PS_( %ymm13,%ymm2,%ymm1 )
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
VFMADD231PS_( %ymm15,%ymm3,%ymm1 )
addq $ 6*SIZE, BO                               // next B row (6 cols)
addq $ 16*SIZE, AO                              // next A column block
decq %rax                                       // k--, ZF used by caller
.endm
/* Write back the 16x6 tile: scale accumulators ymm4..ymm15 by ALPHA
   (stack slot, broadcast into ymm0) and store to C. For the plain GEMM
   build (!TRMMKERNEL) the existing C values are added first (C = alpha*AB + C);
   the TRMM build overwrites (C = alpha*AB). CO1 addresses columns 0..2
   via LDC strides, CO2 columns 3..5 (LDC is in bytes here). */
.macro SAVE16x6
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm5 , %ymm5
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm7 , %ymm7
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm9 , %ymm9
vmulps %ymm0 , %ymm10, %ymm10
vmulps %ymm0 , %ymm11, %ymm11
vmulps %ymm0 , %ymm12, %ymm12
vmulps %ymm0 , %ymm13, %ymm13
vmulps %ymm0 , %ymm14, %ymm14
vmulps %ymm0 , %ymm15, %ymm15
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps 8 * SIZE(CO1), %ymm5,%ymm5
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
vaddps (CO1, LDC,2), %ymm8,%ymm8
vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9
vaddps (CO2), %ymm10,%ymm10
vaddps 8 * SIZE(CO2), %ymm11,%ymm11
vaddps (CO2, LDC), %ymm12,%ymm12
vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13
vaddps (CO2, LDC,2), %ymm14,%ymm14
vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15
#endif
// unaligned stores: 16 rows x 6 columns
vmovups %ymm4 , (CO1)
vmovups %ymm5 , 8 * SIZE(CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm7 , 8 * SIZE(CO1, LDC)
vmovups %ymm8 , (CO1, LDC,2)
vmovups %ymm9 , 8 * SIZE(CO1, LDC,2)
vmovups %ymm10, (CO2)
vmovups %ymm11, 8 * SIZE(CO2)
vmovups %ymm12, (CO2, LDC)
vmovups %ymm13, 8 * SIZE(CO2, LDC)
vmovups %ymm14, (CO2, LDC,2)
vmovups %ymm15, 8 * SIZE(CO2, LDC,2)
.endm
/*******************************************************************************************/
/* One k-step of the 8x6 micro-kernel: C(8x6) += A(8x1) * B(1x6).
   8 floats of A in ymm0; accumulators are the even registers
   ymm4,6,8,10,12,14 (one per B column). Advances AO by 8 and BO by
   6 floats; decq sets ZF for the caller's loop branch. */
.macro KERNEL8x6_SUB
vmovups -16 * SIZE(AO), %ymm0                   // A[0..7]
vbroadcastss -4 * SIZE(BO), %ymm2               // B col 0
vbroadcastss -3 * SIZE(BO), %ymm3               // B col 1
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
vbroadcastss -2 * SIZE(BO), %ymm2               // B col 2
vbroadcastss -1 * SIZE(BO), %ymm3               // B col 3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
vbroadcastss 0 * SIZE(BO), %ymm2                // B col 4
vbroadcastss 1 * SIZE(BO), %ymm3                // B col 5
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax                                       // k--, ZF used by caller
.endm
/* Write back the 8x6 tile: scale ymm4,6,8,10,12,14 by ALPHA and store.
   !TRMMKERNEL adds the existing C values first; CO1 covers columns
   0..2, CO2 columns 3..5 (byte-scaled LDC strides). */
.macro SAVE8x6
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm10, %ymm10
vmulps %ymm0 , %ymm12, %ymm12
vmulps %ymm0 , %ymm14, %ymm14
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps (CO1, LDC,2), %ymm8,%ymm8
vaddps (CO2), %ymm10,%ymm10
vaddps (CO2, LDC), %ymm12,%ymm12
vaddps (CO2, LDC,2), %ymm14,%ymm14
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm8 , (CO1, LDC,2)
vmovups %ymm10, (CO2)
vmovups %ymm12, (CO2, LDC)
vmovups %ymm14, (CO2, LDC,2)
.endm
/*******************************************************************************************/
/* One k-step of the 4x6 micro-kernel: C(4x6) += A(4x1) * B(1x6).
   Uses 128-bit xmm registers; 4 floats of A in xmm0, accumulators
   xmm4,6,8,10,12,14 (one per B column). Advances AO by 4 and BO by
   6 floats; decq sets ZF for the caller's loop branch. */
.macro KERNEL4x6_SUB
vmovups -16 * SIZE(AO), %xmm0                   // A[0..3]
vbroadcastss -4 * SIZE(BO), %xmm2               // B col 0
vbroadcastss -3 * SIZE(BO), %xmm3               // B col 1
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
vbroadcastss -2 * SIZE(BO), %xmm2               // B col 2
vbroadcastss -1 * SIZE(BO), %xmm3               // B col 3
VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
vbroadcastss 0 * SIZE(BO), %xmm2                // B col 4
vbroadcastss 1 * SIZE(BO), %xmm3                // B col 5
VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax                                       // k--, ZF used by caller
.endm
/* Write back the 4x6 tile: scale xmm4,6,8,10,12,14 by ALPHA and store.
   !TRMMKERNEL adds the existing C first. CO1 = cols 0..2, CO2 = cols 3..5. */
.macro SAVE4x6
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
vmulps %xmm0 , %xmm8 , %xmm8
vmulps %xmm0 , %xmm10, %xmm10
vmulps %xmm0 , %xmm12, %xmm12
vmulps %xmm0 , %xmm14, %xmm14
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
vaddps (CO1, LDC,2), %xmm8,%xmm8
vaddps (CO2), %xmm10,%xmm10
vaddps (CO2, LDC), %xmm12,%xmm12
vaddps (CO2, LDC,2), %xmm14,%xmm14
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
vmovups %xmm8 , (CO1, LDC,2)
vmovups %xmm10, (CO2)
vmovups %xmm12, (CO2, LDC)
vmovups %xmm14, (CO2, LDC,2)
.endm
/*******************************************************************************************/
/* One k-step of the 2x6 micro-kernel using scalar FMAs:
   row 0 of A in xmm0, row 1 in xmm1; accumulators xmm4..xmm15
   (even = row 0, odd = row 1, one pair per B column).
   Advances AO by 2 and BO by 6 floats; decq sets ZF for the caller. */
.macro KERNEL2x6_SUB
vmovss -16 * SIZE(AO), %xmm0                    // A row 0
vmovss -15 * SIZE(AO), %xmm1                    // A row 1
vmovss -4 * SIZE(BO), %xmm2                     // B col 0
vmovss -3 * SIZE(BO), %xmm3                     // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
vmovss -2 * SIZE(BO), %xmm2                     // B col 2
vmovss -1 * SIZE(BO), %xmm3                     // B col 3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
vmovss 0 * SIZE(BO), %xmm2                      // B col 4
vmovss 1 * SIZE(BO), %xmm3                      // B col 5
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm13,%xmm2,%xmm1 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax                                       // k--, ZF used by caller
.endm
/* Write back the 2x6 tile with scalar ops: scale xmm4..xmm15 by ALPHA,
   optionally add existing C (!TRMMKERNEL), then store two rows per
   column. CO1 = cols 0..2, CO2 = cols 3..5. */
.macro SAVE2x6
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm9 , %xmm9
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm11, %xmm11
vmulss %xmm0 , %xmm12, %xmm12
vmulss %xmm0 , %xmm13, %xmm13
vmulss %xmm0 , %xmm14, %xmm14
vmulss %xmm0 , %xmm15, %xmm15
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C (rows 0 and 1 of each column)
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
vaddss (CO1, LDC,2), %xmm8,%xmm8
vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9
vaddss (CO2), %xmm10,%xmm10
vaddss 1 * SIZE(CO2), %xmm11,%xmm11
vaddss (CO2, LDC), %xmm12,%xmm12
vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13
vaddss (CO2, LDC,2), %xmm14,%xmm14
vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
vmovss %xmm8 , (CO1, LDC,2)
vmovss %xmm9 , 1 * SIZE(CO1, LDC,2)
vmovss %xmm10, (CO2)
vmovss %xmm11, 1 * SIZE(CO2)
vmovss %xmm12, (CO2, LDC)
vmovss %xmm13, 1 * SIZE(CO2, LDC)
vmovss %xmm14, (CO2, LDC,2)
vmovss %xmm15, 1 * SIZE(CO2, LDC,2)
.endm
/*******************************************************************************************/
/* One k-step of the 1x6 micro-kernel: a single A value (xmm0) times
   six B values, accumulated into xmm4,6,8,10,12,14. Advances AO by 1
   and BO by 6 floats; decq sets ZF for the caller's loop branch. */
.macro KERNEL1x6_SUB
vmovss -16 * SIZE(AO), %xmm0                    // A row 0
vmovss -4 * SIZE(BO), %xmm2                     // B col 0
vmovss -3 * SIZE(BO), %xmm3                     // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
vmovss -2 * SIZE(BO), %xmm2                     // B col 2
vmovss -1 * SIZE(BO), %xmm3                     // B col 3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
vmovss 0 * SIZE(BO), %xmm2                      // B col 4
vmovss 1 * SIZE(BO), %xmm3                      // B col 5
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 1*SIZE, AO
decq %rax                                       // k--, ZF used by caller
.endm
/* Write back the 1x6 tile: scale xmm4,6,8,10,12,14 by ALPHA, optionally
   add existing C (!TRMMKERNEL), store one element per column. */
.macro SAVE1x6
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm12, %xmm12
vmulss %xmm0 , %xmm14, %xmm14
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss (CO1, LDC,2), %xmm8,%xmm8
vaddss (CO2), %xmm10,%xmm10
vaddss (CO2, LDC), %xmm12,%xmm12
vaddss (CO2, LDC,2), %xmm14,%xmm14
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm8 , (CO1, LDC,2)
vmovss %xmm10, (CO2)
vmovss %xmm12, (CO2, LDC)
vmovss %xmm14, (CO2, LDC,2)
.endm
/*******************************************************************************************/
/*******************************************************************************************
* 4 lines of N
*******************************************************************************************/
/* One k-step of the 16x4 micro-kernel: C(16x4) += A(16x1) * B(1x4).
   Unlike the x6 kernels, A and B are addressed through indices:
   %rax indexes into AO and BI into BO (both scaled by SIZE), and only
   the indices advance. Accumulators: ymm4/5, ymm6/7, ymm8/9, ymm10/11
   (one register pair per B column). */
.macro KERNEL16x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1        // A[8..15]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3     // B col 1
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2     // B col 2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3     // B col 3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm9,%ymm2,%ymm1 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
VFMADD231PS_( %ymm11,%ymm3,%ymm1 )
addq $ 4 , BI                                   // B index += 4 cols
addq $ 16, %rax                                 // A index += 16 rows
.endm
/* Write back the 16x4 tile: scale ymm4..ymm11 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store, then prefetch the next C cache lines.
   CO1 = cols 0..1, CO2 = cols 2..3 (byte-scaled LDC strides). */
.macro SAVE16x4
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm5 , %ymm5
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm7 , %ymm7
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm9 , %ymm9
vmulps %ymm0 , %ymm10, %ymm10
vmulps %ymm0 , %ymm11, %ymm11
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps 8 * SIZE(CO1), %ymm5,%ymm5
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
vaddps (CO2), %ymm8,%ymm8
vaddps 8 * SIZE(CO2), %ymm9,%ymm9
vaddps (CO2, LDC), %ymm10,%ymm10
vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , 8 * SIZE(CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm7 , 8 * SIZE(CO1, LDC)
vmovups %ymm8 , (CO2)
vmovups %ymm9 , 8 * SIZE(CO2)
vmovups %ymm10, (CO2, LDC)
vmovups %ymm11, 8 * SIZE(CO2, LDC)
// warm the next tile's C lines
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
prefetcht0 64(CO2)
prefetcht0 64(CO2, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 8x4 micro-kernel: C(8x4) += A(8x1) * B(1x4).
   Index-based addressing (%rax for A, BI for B); accumulators are the
   even registers ymm4,6,8,10, one per B column. */
.macro KERNEL8x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3     // B col 1
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2     // B col 2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3     // B col 3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
addq $ 4 , BI                                   // B index += 4 cols
addq $ 8 , %rax                                 // A index += 8 rows
.endm
/* Write back the 8x4 tile: scale ymm4,6,8,10 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store. CO1 = cols 0..1, CO2 = cols 2..3. */
.macro SAVE8x4
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm10, %ymm10
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps (CO2), %ymm8,%ymm8
vaddps (CO2, LDC), %ymm10,%ymm10
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm8 , (CO2)
vmovups %ymm10, (CO2, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 4x4 micro-kernel: C(4x4) += A(4x1) * B(1x4).
   128-bit xmm variant of KERNEL8x4_SUB; accumulators xmm4,6,8,10. */
.macro KERNEL4x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0       // A[0..3]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3     // B col 1
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2     // B col 2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3     // B col 3
VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
addq $ 4 , BI                                   // B index += 4 cols
addq $ 4 , %rax                                 // A index += 4 rows
.endm
/* Write back the 4x4 tile: scale xmm4,6,8,10 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store. CO1 = cols 0..1, CO2 = cols 2..3. */
.macro SAVE4x4
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
vmulps %xmm0 , %xmm8 , %xmm8
vmulps %xmm0 , %xmm10, %xmm10
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
vaddps (CO2), %xmm8,%xmm8
vaddps (CO2, LDC), %xmm10,%xmm10
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
vmovups %xmm8 , (CO2)
vmovups %xmm10, (CO2, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 2x4 micro-kernel with scalar FMAs:
   A row 0 in xmm0, row 1 in xmm1; accumulators xmm4..xmm11
   (even = row 0, odd = row 1, one pair per B column). */
.macro KERNEL2x4_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A row 0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1        // A row 1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B col 0
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3           // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2           // B col 2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3           // B col 3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
addq $ 4 , BI                                   // B index += 4 cols
addq $ 2, %rax                                  // A index += 2 rows
.endm
/* Write back the 2x4 tile: scale xmm4..xmm11 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store two rows per column.
   CO1 = cols 0..1, CO2 = cols 2..3. */
.macro SAVE2x4
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm9 , %xmm9
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm11, %xmm11
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
vaddss (CO2), %xmm8,%xmm8
vaddss 1 * SIZE(CO2), %xmm9,%xmm9
vaddss (CO2, LDC), %xmm10,%xmm10
vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
vmovss %xmm8 , (CO2)
vmovss %xmm9 , 1 * SIZE(CO2)
vmovss %xmm10, (CO2, LDC)
vmovss %xmm11, 1 * SIZE(CO2, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 1x4 micro-kernel: one A value (xmm0) times four
   B values, accumulated into xmm4,6,8,10. Index-based addressing. */
.macro KERNEL1x4_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A row 0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B col 0
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3           // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2           // B col 2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3           // B col 3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
addq $ 4 , BI                                   // B index += 4 cols
addq $ 1, %rax                                  // A index += 1 row
.endm
/* Write back the 1x4 tile: scale xmm4,6,8,10 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store one element per column. */
.macro SAVE1x4
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm10, %xmm10
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss (CO2), %xmm8,%xmm8
vaddss (CO2, LDC), %xmm10,%xmm10
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm8 , (CO2)
vmovss %xmm10, (CO2, LDC)
.endm
/*******************************************************************************************/
/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/
/* One k-step of the 16x2 micro-kernel: C(16x2) += A(16x1) * B(1x2).
   A in ymm0/ymm1; accumulators ymm4/5 (col 0) and ymm6/7 (col 1).
   Index-based addressing; advances BI by 2 and %rax by 16. */
.macro KERNEL16x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1        // A[8..15]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3     // B col 1
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
addq $ 2 , BI                                   // B index += 2 cols
addq $ 16, %rax                                 // A index += 16 rows
.endm
/* Write back the 16x2 tile: scale ymm4..ymm7 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store both columns via CO1/LDC. */
.macro SAVE16x2
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm5 , %ymm5
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm7 , %ymm7
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps 8 * SIZE(CO1), %ymm5,%ymm5
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , 8 * SIZE(CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm7 , 8 * SIZE(CO1, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 8x2 micro-kernel: C(8x2) += A(8x1) * B(1x2).
   Accumulators ymm4 (col 0) and ymm6 (col 1); index-based addressing. */
.macro KERNEL8x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3     // B col 1
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
addq $ 2 , BI                                   // B index += 2 cols
addq $ 8 , %rax                                 // A index += 8 rows
.endm
/* Write back the 8x2 tile: scale ymm4/ymm6 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store both columns. */
.macro SAVE8x2
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 4x2 micro-kernel: C(4x2) += A(4x1) * B(1x2).
   128-bit variant; accumulators xmm4 (col 0) and xmm6 (col 1). */
.macro KERNEL4x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0       // A[0..3]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2     // B col 0
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3     // B col 1
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
addq $ 2 , BI                                   // B index += 2 cols
addq $ 4 , %rax                                 // A index += 4 rows
.endm
/* Write back the 4x2 tile: scale xmm4/xmm6 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store both columns. */
.macro SAVE4x2
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 2x2 micro-kernel with scalar FMAs:
   A rows in xmm0/xmm1; accumulators xmm4/5 (col 0) and xmm6/7 (col 1). */
.macro KERNEL2x2_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A row 0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1        // A row 1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B col 0
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3           // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
addq $ 2 , BI                                   // B index += 2 cols
addq $ 2, %rax                                  // A index += 2 rows
.endm
/* Write back the 2x2 tile: scale xmm4..xmm7 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store two rows per column. */
.macro SAVE2x2
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
.endm
/*******************************************************************************************/
/* One k-step of the 1x2 micro-kernel: one A value times two B values,
   accumulated into xmm4 (col 0) and xmm6 (col 1). */
.macro KERNEL1x2_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A row 0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B col 0
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3           // B col 1
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
addq $ 2 , BI                                   // B index += 2 cols
addq $ 1, %rax                                  // A index += 1 row
.endm
/* Write back the 1x2 tile: scale xmm4/xmm6 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store one element per column. */
.macro SAVE1x2
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
.endm
/*******************************************************************************************/
/*******************************************************************************************
* 1 line of N
*******************************************************************************************/
/* One k-step of the 16x1 micro-kernel: C(16x1) += A(16x1) * b.
   A in ymm0/ymm1, the single B value broadcast into ymm2;
   accumulators ymm4/ymm5. */
.macro KERNEL16x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1        // A[8..15]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
addq $ 1 , BI                                   // B index += 1 col
addq $ 16, %rax                                 // A index += 16 rows
.endm
/* Write back the 16x1 tile: scale ymm4/ymm5 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store the single column. */
.macro SAVE16x1
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm5 , %ymm5
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddps (CO1), %ymm4,%ymm4
vaddps 8 * SIZE(CO1), %ymm5,%ymm5
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , 8 * SIZE(CO1)
.endm
/*******************************************************************************************/
/* One k-step of the 8x1 micro-kernel: C(8x1) += A(8x1) * b.
   Accumulator ymm4. */
.macro KERNEL8x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0       // A[0..7]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2     // B col 0
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
addq $ 1 , BI                                   // B index += 1 col
addq $ 8 , %rax                                 // A index += 8 rows
.endm
/* Write back the 8x1 tile: scale ymm4 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store. */
.macro SAVE8x1
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm4,%ymm4                       // GEMM path: C += alpha*AB
#endif
vmovups %ymm4 , (CO1)
.endm
/*******************************************************************************************/
/* One k-step of the 4x1 micro-kernel: C(4x1) += A(4x1) * b.
   128-bit variant; accumulator xmm4. */
.macro KERNEL4x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0       // A[0..3]
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2     // B col 0
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
addq $ 1 , BI                                   // B index += 1 col
addq $ 4 , %rax                                 // A index += 4 rows
.endm
/* Write back the 4x1 tile: scale xmm4 by ALPHA, optionally add
   existing C (!TRMMKERNEL), and store. */
.macro SAVE4x1
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm4,%xmm4                       // GEMM path: C += alpha*AB
#endif
vmovups %xmm4 , (CO1)
.endm
/*******************************************************************************************/
/* One k-step of the 2x1 micro-kernel with scalar FMAs:
   A rows in xmm0/xmm1, single B value in xmm2; accumulators xmm4/xmm5. */
.macro KERNEL2x1_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A row 0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1        // A row 1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B col 0
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
addq $ 1 , BI                                   // B index += 1 col
addq $ 2 , %rax                                 // A index += 2 rows
.endm
/* Write back the 2x1 tile: scale xmm4/xmm5 by ALPHA, optionally add
   existing C (!TRMMKERNEL), store both rows. */
.macro SAVE2x1
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
#if !defined(TRMMKERNEL)
// GEMM path: accumulate into existing C
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
.endm
/*******************************************************************************************/
/* One k-step of the 1x1 micro-kernel: c += a * b (scalar FMA into xmm4). */
.macro KERNEL1x1_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0        // A value
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2           // B value
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
addq $ 1 , BI                                   // B index += 1
addq $ 1 , %rax                                 // A index += 1
.endm
/* Write back the 1x1 result: scale xmm4 by ALPHA, optionally add the
   existing C element (!TRMMKERNEL), and store. */
.macro SAVE1x1
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4                       // GEMM path: c += alpha*ab
#endif
vmovss %xmm4 , (CO1)
.endm
/*******************************************************************************************/
#if !defined(TRMMKERNEL)
/*************************************************************************************
* GEMM Kernel
*************************************************************************************/
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $128 + L_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCH
cmpq $0, OLD_M
je .L999
cmpq $0, OLD_N
je .L999
cmpq $0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA
salq $BASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $12, %rdi
divq %rdi // N / 12
movq %rax, Ndiv6 // N / 12
movq %rdx, Nmod6 // N % 12
movq Ndiv6, J
cmpq $0, J
je .L4_00
ALIGN_4
/*******************************************************************************************/
.L6_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 4 values of B
leaq (B, %rax,4), BO2
movq BO2, B // next offset of B
movq K, %rax
ALIGN_4
.L6_02c:
vmovups (BO1), %xmm0
vmovsd (BO2), %xmm1
vmovups %xmm0, (BO)
vmovsd %xmm1, 4*SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L6_02c
.L6_10:
movq C, CO1
leaq (C, LDC, 2), CO2
leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc
leaq (C, LDC, 4), C
leaq (C, LDC, 2), C // c = c + 6 * ldc
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L6_20
ALIGN_4
.L6_11:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax // K = K - ( K % 8 )
je .L6_16
ALIGN_4
.L6_12:
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
je .L6_16
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
je .L6_16
jmp .L6_12
ALIGN_4
.L6_16:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L6_19
ALIGN_4
.L6_17:
KERNEL16x6_SUB
jnz .L6_17
ALIGN_4
.L6_19:
SAVE16x6
addq $16 * SIZE, CO1 # coffset += 16
addq $16 * SIZE, CO2 # coffset += 16
decq I # i --
jg .L6_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L6_20:
// Test rest of M
testq $15, M
jz .L6_60 // to next 6 lines of N
testq $8, M
jz .L6_21pre
ALIGN_4
/**************************************************************************/
.L6_20_1:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L6_20_6
ALIGN_4
.L6_20_2:
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
je .L6_20_6
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
je .L6_20_6
jmp .L6_20_2
ALIGN_4
.L6_20_6:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L6_20_9
ALIGN_4
.L6_20_7:
KERNEL8x6_SUB
jnz .L6_20_7
ALIGN_4
.L6_20_9:
SAVE8x6
addq $8 * SIZE, CO1 # coffset += 8
addq $8 * SIZE, CO2 # coffset += 8
ALIGN_4
/**************************************************************************/
.L6_21pre:
testq $4, M
jz .L6_30
ALIGN_4
.L6_21:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L6_26
ALIGN_4
.L6_22:
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
je .L6_26
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
je .L6_26
jmp .L6_22
ALIGN_4
.L6_26:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L6_29
ALIGN_4
.L6_27:
KERNEL4x6_SUB
jnz .L6_27
ALIGN_4
.L6_29:
SAVE4x6
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
.L6_30:
testq $2, M
jz .L6_40
ALIGN_4
.L6_31:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L6_36
ALIGN_4
.L6_32:
prefetcht0 A_PR1(AO)
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
je .L6_36
prefetcht0 A_PR1(AO)
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
je .L6_36
jmp .L6_32
ALIGN_4
.L6_36:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L6_39
ALIGN_4
.L6_37:
KERNEL2x6_SUB
jnz .L6_37
ALIGN_4
.L6_39:
SAVE2x6
addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_4
.L6_40:
testq $1, M
jz .L6_60 // to next 4 lines of N
ALIGN_4
.L6_41:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L6_46
ALIGN_4
.L6_42:
prefetcht0 A_PR1(AO)
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
je .L6_46
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
je .L6_46
jmp .L6_42
ALIGN_4
.L6_46:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L6_49
ALIGN_4
.L6_47:
KERNEL1x6_SUB
jnz .L6_47
ALIGN_4
.L6_49:
SAVE1x6
addq $1 * SIZE, CO1 # coffset += 1
addq $1 * SIZE, CO2 # coffset += 1
ALIGN_4
.L6_60:
/*******************************************************************************************/
.L7_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 4 values of B
leaq (B, %rax,4), BO2
movq K, %rax
ALIGN_4
.L7_02c:
vmovsd 2*SIZE(BO1), %xmm0
vmovups (BO2), %xmm1
vmovsd %xmm0, (BO)
vmovups %xmm1, 2*SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L7_02c
movq BO2, B // next offset of B
.L7_10:
movq C, CO1
leaq (C, LDC, 2), CO2
leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc
leaq (C, LDC, 4), C
leaq (C, LDC, 2), C // c = c + 6 * ldc
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L7_20
ALIGN_4
.L7_11:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax // K = K - ( K % 8 )
je .L7_16
ALIGN_4
.L7_12:
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
je .L7_16
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
KERNEL16x6_SUB
je .L7_16
jmp .L7_12
ALIGN_4
.L7_16:
movq K, %rax
andq $7, %rax # if (k & 1)
je .L7_19
ALIGN_4
.L7_17:
KERNEL16x6_SUB
jnz .L7_17
ALIGN_4
.L7_19:
SAVE16x6
addq $16 * SIZE, CO1 # coffset += 16
addq $16 * SIZE, CO2 # coffset += 16
decq I # i --
jg .L7_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L7_20:
// Remainder of M after the 16-wide tiles: handle an 8-row sub-tile if M has
// bit 3 set, otherwise fall through to the 4/2/1-row cases.
// Test rest of M
testq $15, M
jz .L7_60 // to next 6 columns of N
testq $8, M
jz .L7_21pre
ALIGN_4
/**************************************************************************/
.L7_20_1:
// 8x6 tile: same structure as 16x6 but with A prefetch interleaved every
// second kernel invocation.
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L7_20_6
ALIGN_4
.L7_20_2:
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
je .L7_20_6
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
prefetcht0 A_PR1(AO)
KERNEL8x6_SUB
KERNEL8x6_SUB
je .L7_20_6
jmp .L7_20_2
ALIGN_4
.L7_20_6:
movq K, %rax
andq $7, %rax # k & 7 tail iterations
je .L7_20_9
ALIGN_4
.L7_20_7:
KERNEL8x6_SUB
jnz .L7_20_7
ALIGN_4
.L7_20_9:
SAVE8x6
addq $8 * SIZE, CO1 # coffset += 8
addq $8 * SIZE, CO2 # coffset += 8
ALIGN_4
/**************************************************************************/
.L7_21pre:
// 4-row sub-tile of the 6-column panel (taken when M has bit 2 set).
testq $4, M
jz .L7_30
ALIGN_4
.L7_21:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L7_26
ALIGN_4
.L7_22:
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
je .L7_26
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
prefetcht0 A_PR1(AO)
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
KERNEL4x6_SUB
je .L7_26
jmp .L7_22
ALIGN_4
.L7_26:
movq K, %rax
andq $7, %rax # k & 7 tail iterations
je .L7_29
ALIGN_4
.L7_27:
KERNEL4x6_SUB
jnz .L7_27
ALIGN_4
.L7_29:
SAVE4x6
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
.L7_30:
// 2-row sub-tile of the 6-column panel (taken when M has bit 1 set).
testq $2, M
jz .L7_40
ALIGN_4
.L7_31:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L7_36
ALIGN_4
.L7_32:
prefetcht0 A_PR1(AO)
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
je .L7_36
prefetcht0 A_PR1(AO)
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
KERNEL2x6_SUB
je .L7_36
jmp .L7_32
ALIGN_4
.L7_36:
movq K, %rax
andq $7, %rax # k & 7 tail iterations
je .L7_39
ALIGN_4
.L7_37:
KERNEL2x6_SUB
jnz .L7_37
ALIGN_4
.L7_39:
SAVE2x6
addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_4
.L7_40:
// Final 1-row sub-tile of the 6-column panel, then close the J loop for
// this 12-wide N iteration.
testq $1, M
jz .L7_60 // done with this 6-column panel
ALIGN_4
.L7_41:
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
vzeroall
movq K, %rax
andq $-8, %rax
je .L7_46
ALIGN_4
.L7_42:
prefetcht0 A_PR1(AO)
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
je .L7_46
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
KERNEL1x6_SUB
je .L7_46
jmp .L7_42
ALIGN_4
.L7_46:
movq K, %rax
andq $7, %rax # k & 7 tail iterations
je .L7_49
ALIGN_4
.L7_47:
KERNEL1x6_SUB
jnz .L7_47
ALIGN_4
.L7_49:
SAVE1x6
addq $1 * SIZE, CO1 # coffset += 1
addq $1 * SIZE, CO2 # coffset += 1
ALIGN_4
.L7_60:
decq J // j --
jg .L6_01 // next 12 lines of N
/*******************************************************************************************/
.L4_00:
// 4-column panels: J = (N mod 6 ... remainder) / 4. From here on the code
// also carries the TRMM offset bookkeeping (KK/KKK) under #ifdef TRMMKERNEL.
movq Nmod6, J
sarq $2, J // j = j / 4
cmpq $ 0, J
je .L2_00
ALIGN_4
.L4_01:
// Pack B into BUFFER1: straight copy, 4 values per k, unrolled 4 k's per pass.
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $2, %rax // K / 4
jz .L4_01b
ALIGN_4
.L4_01a:
prefetcht0 512(BO1)
prefetchw 512(BO)
vmovups (BO1), %xmm0
vmovups 4*SIZE(BO1), %xmm1
vmovups 8*SIZE(BO1), %xmm2
vmovups 12*SIZE(BO1), %xmm3
vmovups %xmm0, (BO)
vmovups %xmm1, 4*SIZE(BO)
vmovups %xmm2, 8*SIZE(BO)
vmovups %xmm3,12*SIZE(BO)
addq $ 16*SIZE,BO1
addq $ 16*SIZE,BO
decq %rax
jnz .L4_01a
.L4_01b:
// Copy the remaining (K % 4) rows of B, one 4-float row at a time.
movq K, %rax
andq $3, %rax // K % 4
jz .L4_02d
ALIGN_4
.L4_02c:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L4_02c
.L4_02d:
movq BO1, B // next offset of B
.L4_10:
// C pointers: CO1 -> columns 0..1, CO2 -> columns 2..3.
movq C, CO1
leaq (C, LDC, 2), CO2
leaq (C, LDC, 4), C // c += 4 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L4_20
ALIGN_4
.L4_11:
// 16x4 tile. For TRMM, AO/BO are pre-advanced by KK (the triangular offset)
// and the effective trip count KKK is computed below; for plain GEMM the
// full K is used. Inside the loops AO/BO are indexed with negated offsets
// (%rax, BI) that count up toward zero.
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L4_16
// Point AO/BO past the unrolled region and negate the indices so the
// kernel macros walk them from -count up to 0.
movq %rax, BI // Index for BO
leaq (,BI,4) , BI // BI = BI * 4 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_12:
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
je .L4_16
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
je .L4_16
jmp .L4_12
ALIGN_4
.L4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L4_19
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_17:
KERNEL16x4_SUB
jl .L4_17
ALIGN_4
.L4_19:
SAVE16x4
// TRMM: skip the untouched part of A/B and bump KK for the next tile.
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK
#endif
addq $16 * SIZE, CO1 # coffset += 16
addq $16 * SIZE, CO2 # coffset += 16
decq I # i --
jg .L4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L4_20:
// Remainder of M for the 4-column panel: 8-row sub-tile first.
// Test rest of M
testq $15, M
jz .L4_60 // to next 4 columns of N
testq $8, M
jz .L4_21pre
ALIGN_4
/**************************************************************************/
.L4_20_1:
// 8x4 tile with TRMM offset handling (A advances 8 values per k).
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_20_6
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_20_2:
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
je .L4_20_6
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
je .L4_20_6
jmp .L4_20_2
ALIGN_4
.L4_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L4_20_9
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_20_7:
KERNEL8x4_SUB
jl .L4_20_7
ALIGN_4
.L4_20_9:
SAVE8x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
addq $8 * SIZE, CO2 # coffset += 8
ALIGN_4
/**************************************************************************/
.L4_21pre:
// 4x4 sub-tile of the 4-column panel (A advances 4 values per k).
testq $4, M
jz .L4_30
ALIGN_4
.L4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_26
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
je .L4_26
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
je .L4_26
jmp .L4_22
ALIGN_4
.L4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L4_29
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_27:
KERNEL4x4_SUB
jl .L4_27
ALIGN_4
.L4_29:
SAVE4x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
.L4_30:
// 2x4 sub-tile of the 4-column panel (A advances 2 values per k).
testq $2, M
jz .L4_40
ALIGN_4
.L4_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_36
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_32:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
je .L4_36
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
je .L4_36
jmp .L4_32
ALIGN_4
.L4_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L4_39
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_37:
KERNEL2x4_SUB
jl .L4_37
ALIGN_4
.L4_39:
SAVE2x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_4
.L4_40:
// 1x4 sub-tile of the 4-column panel (A advances 1 value per k; no shift
// needed on %rax), then close the J loop for this 4-column panel.
testq $1, M
jz .L4_60 // to next 4 lines of N
ALIGN_4
.L4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_46
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_42:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
je .L4_46
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
je .L4_46
jmp .L4_42
ALIGN_4
.L4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L4_49
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_47:
KERNEL1x4_SUB
jl .L4_47
ALIGN_4
.L4_49:
SAVE1x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
addq $1 * SIZE, CO2 # coffset += 1
ALIGN_4
.L4_60:
// TRMM RIGHT case: advance KK past the 4 columns just processed.
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK
#endif
decq J // j --
jg .L4_01 // next 4 lines of N
/*******************************************************************************************/
.L2_00:
// 2-column panel: taken when the N remainder has bit 1 set.
movq Nmod6, J
andq $3, J // j % 4
je .L999
movq Nmod6, J
andq $2, J // j % 4
je .L1_0
.L2_01:
// Pack B into BUFFER1: 2 values per k, unrolled 4 k's per pass.
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $2, %rax // K / 4
jz .L2_01b
ALIGN_4
.L2_01a:
vmovsd (BO1), %xmm0
vmovsd 2*SIZE(BO1), %xmm1
vmovsd 4*SIZE(BO1), %xmm2
vmovsd 6*SIZE(BO1), %xmm3
vmovsd %xmm0, (BO)
vmovsd %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovsd %xmm3, 6*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO
decq %rax
jnz .L2_01a
.L2_01b:
movq K, %rax
andq $3, %rax // K % 4
jz .L2_02d
ALIGN_4
.L2_02c:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO
decq %rax
jnz .L2_02c
.L2_02d:
movq BO1, B // next offset of B
.L2_10:
// Only CO1 is needed: SAVE*x2 handles both columns from one base pointer.
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L2_20
ALIGN_4
.L2_11:
// 16x2 tile: B advances 2 values per k (BI = index*2 via lea), A 16 per k.
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_16
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_12:
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
je .L2_16
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
je .L2_16
jmp .L2_12
ALIGN_4
.L2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L2_19
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_17:
KERNEL16x2_SUB
jl .L2_17
ALIGN_4
.L2_19:
SAVE16x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK
#endif
addq $16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
// Remainder of M for the 2-column panel: 8-row sub-tile first.
// Test rest of M
testq $15, M
jz .L2_60 // to next 2 lines of N
testq $8, M
jz .L2_21pre
ALIGN_4
/**************************************************************************/
.L2_20_1:
// 8x2 tile (A advances 8 values per k, B 2 per k).
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_20_6
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_20_2:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
je .L2_20_6
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
je .L2_20_6
jmp .L2_20_2
ALIGN_4
.L2_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L2_20_9
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_20_7:
KERNEL8x2_SUB
jl .L2_20_7
ALIGN_4
.L2_20_9:
SAVE8x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************/
.L2_21pre:
// 4x2 sub-tile of the 2-column panel (A advances 4 values per k).
testq $4, M
jz .L2_30
ALIGN_4
.L2_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_26
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_26
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_26
jmp .L2_22
ALIGN_4
.L2_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L2_29
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_27:
KERNEL4x2_SUB
jl .L2_27
ALIGN_4
.L2_29:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
.L2_30:
// 2x2 sub-tile of the 2-column panel (A advances 2 values per k).
testq $2, M
jz .L2_40
ALIGN_4
.L2_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_36
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_32:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_36
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_36
jmp .L2_32
ALIGN_4
.L2_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L2_39
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_37:
KERNEL2x2_SUB
jl .L2_37
ALIGN_4
.L2_39:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
.L2_40:
// 1x2 sub-tile of the 2-column panel (A advances 1 value per k), then
// TRMM KK adjustment for the 2 columns just finished.
testq $1, M
jz .L2_60 // to next 2 lines of N
ALIGN_4
.L2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_46
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_42:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_46
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_46
jmp .L2_42
ALIGN_4
.L2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L2_49
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_47:
KERNEL1x2_SUB
jl .L2_47
ALIGN_4
.L2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
ALIGN_4
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
// Final 1-column panel: B is copied to BUFFER1 one float per k.
movq Nmod6, J
andq $1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_02b:
vmovss (BO1), %xmm0
vmovss %xmm0, (BO)
addq $1*SIZE,BO1
addq $1*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L1_20
ALIGN_4
.L1_11:
// 16x1 tile: B advances 1 value per k (BI needs no scaling), A 16 per k.
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_16
movq %rax, BI // Index for BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_12:
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
je .L1_16
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
je .L1_16
jmp .L1_12
ALIGN_4
.L1_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L1_19
movq %rax, BI // Index for BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_17:
KERNEL16x1_SUB
jl .L1_17
ALIGN_4
.L1_19:
SAVE16x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK
#endif
addq $16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L1_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_20:
// Remainder of M for the 1-column panel: 8-row sub-tile first; .L999
// (function epilogue, outside this region) is reached when M is a
// multiple of 16.
// Test rest of M
testq $15, M
jz .L999
testq $8, M
jz .L1_21pre
ALIGN_4
/**************************************************************************/
.L1_20_1:
// 8x1 tile (A advances 8 values per k, B 1 per k).
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L1_20_6
movq %rax, BI // Index for BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_20_2:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
je .L1_20_6
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
je .L1_20_6
jmp .L1_20_2
ALIGN_4
.L1_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # k & 7 tail iterations
je .L1_20_9
movq %rax, BI // Index for BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_20_7:
KERNEL8x1_SUB
jl .L1_20_7
ALIGN_4
.L1_20_9:
SAVE8x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************/
// ---- 4x1 micro-kernel (M has the 4 bit set) ----
.L1_21pre:
testq $4, M
jz .L1_30
ALIGN_4
.L1_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L1_26
movq %rax, BI // Index for BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_26
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_26
jmp .L1_22
ALIGN_4
.L1_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L1_29
movq %rax, BI // Index for BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_27:
KERNEL4x1_SUB
jl .L1_27
ALIGN_4
.L1_29:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
// ---- 2x1 micro-kernel (M has the 2 bit set) ----
.L1_30:
testq $2, M
jz .L1_40
ALIGN_4
.L1_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L1_36
movq %rax, BI // Index for BO
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_32:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_36
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_36
jmp .L1_32
ALIGN_4
.L1_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L1_39
movq %rax, BI // Index for BO
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_37:
KERNEL2x1_SUB
jl .L1_37
ALIGN_4
.L1_39:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
// ---- 1x1 micro-kernel (last odd row of M); A advances 1 value per k ----
.L1_40:
testq $1, M
jz .L999
ALIGN_4
.L1_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L1_46
movq %rax, BI // Index for BO
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_42:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_46
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_46
jmp .L1_42
ALIGN_4
.L1_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L1_49
movq %rax, BI // Index for BO
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_47:
KERNEL1x1_SUB
jl .L1_47
ALIGN_4
.L1_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
ALIGN_4
// ---- function epilogue: undo the prologue's register/stack saves ----
.L999:
movq SP, %rsp // restore the pre-buffer stack pointer saved at entry
movq (%rsp), %rbx // restore callee-saved GP registers (SysV + Win64)
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
// Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE
#else
/*************************************************************************************
* TRMM Kernel
*************************************************************************************/
// Prologue mirrors the GEMM entry above: save callee-saved registers, read the
// arguments (per ABI), carve an aligned scratch buffer off the stack, and split
// N into N/4 full panels (Ndiv6) plus an N%4 remainder (Nmod6).
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0 // Win64 passes alpha in xmm3; SysV expects it in xmm0
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $128 + L_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCH
// early out for empty problems
cmpq $0, OLD_M
je .L999
cmpq $0, OLD_N
je .L999
cmpq $0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA
salq $BASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx // zero rdx before unsigned div (rdx:rax / rdi)
movq $4, %rdi
divq %rdi // N / 4
movq %rax, Ndiv6 // N / 4
movq %rdx, Nmod6 // N % 4
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK // right-side TRMM starts KK at -offset
#endif
#endif
movq Ndiv6, J
cmpq $0, J
je .L2_0
ALIGN_4
/*******************************************************************************************/
// ---- pack the next 4 columns of B into the aligned sub-buffer (BUFFER1) ----
.L4_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $2, %rax // K / 4
jz .L4_01b
ALIGN_4
.L4_01a:
// copy 4 k-iterations (16 floats) per pass, prefetching ahead
prefetcht0 512(BO1)
prefetchw 512(BO)
vmovups (BO1), %xmm0
vmovups 4*SIZE(BO1), %xmm1
vmovups 8*SIZE(BO1), %xmm2
vmovups 12*SIZE(BO1), %xmm3
vmovups %xmm0, (BO)
vmovups %xmm1, 4*SIZE(BO)
vmovups %xmm2, 8*SIZE(BO)
vmovups %xmm3,12*SIZE(BO)
addq $ 16*SIZE,BO1
addq $ 16*SIZE,BO
decq %rax
jnz .L4_01a
.L4_01b:
movq K, %rax
andq $3, %rax // K % 4
jz .L4_02d
ALIGN_4
.L4_02c:
// tail: one k-iteration (4 floats) per pass
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L4_02c
.L4_02d:
movq BO1, B // next offset of B
// ---- 16x4 main micro-kernel over M (I = M >> 4 tiles) ----
.L4_10:
movq C, CO1
leaq (C, LDC, 2), CO2 // CO2 = columns 2..3 of the tile
leaq (C, LDC, 4), C // c += 4 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L4_20
ALIGN_4
.L4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L4_16
movq %rax, BI // Index for BO
leaq (,BI,4) , BI // BI = BI * 4 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
// hot loop: software prefetch of A every iteration, of B every 4th
.L4_12:
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
je .L4_16
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
prefetcht0 B_PR1(BO, BI , SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
prefetcht0 A_PR1(AO, %rax, SIZE)
KERNEL16x4_SUB
je .L4_16
jmp .L4_12
ALIGN_4
.L4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L4_19
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_17:
KERNEL16x4_SUB
jl .L4_17
ALIGN_4
.L4_19:
SAVE16x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK
#endif
addq $16 * SIZE, CO1 # coffset += 16
addq $16 * SIZE, CO2 # coffset += 16
decq I # i --
jg .L4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L4_20:
// Test rest of M
testq $15, M
jz .L4_60 // to next 3 lines of N
testq $8, M
jz .L4_21pre
ALIGN_4
/**************************************************************************/
// ---- 8x4 micro-kernel (M has the 8 bit set) ----
.L4_20_1:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_20_6
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_20_2:
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
je .L4_20_6
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
je .L4_20_6
jmp .L4_20_2
ALIGN_4
.L4_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L4_20_9
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_20_7:
KERNEL8x4_SUB
jl .L4_20_7
ALIGN_4
.L4_20_9:
SAVE8x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
addq $8 * SIZE, CO2 # coffset += 8
ALIGN_4
/**************************************************************************/
// ---- 4x4 micro-kernel (M has the 4 bit set) ----
.L4_21pre:
testq $4, M
jz .L4_30
ALIGN_4
.L4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_26
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
je .L4_26
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
je .L4_26
jmp .L4_22
ALIGN_4
.L4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L4_29
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_27:
KERNEL4x4_SUB
jl .L4_27
ALIGN_4
.L4_29:
SAVE4x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
// ---- 2x4 micro-kernel (M has the 2 bit set) ----
.L4_30:
testq $2, M
jz .L4_40
ALIGN_4
.L4_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_36
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_32:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
je .L4_36
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
je .L4_36
jmp .L4_32
ALIGN_4
.L4_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L4_39
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_37:
KERNEL2x4_SUB
jl .L4_37
ALIGN_4
.L4_39:
SAVE2x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_4
// ---- 1x4 micro-kernel (last odd row of M) ----
.L4_40:
testq $1, M
jz .L4_60 // to next 4 lines of N
ALIGN_4
.L4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L4_46
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_42:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
je .L4_46
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
je .L4_46
jmp .L4_42
ALIGN_4
.L4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L4_49
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L4_47:
KERNEL1x4_SUB
jl .L4_47
ALIGN_4
.L4_49:
SAVE1x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (,BI, 4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
addq $1 * SIZE, CO2 # coffset += 1
ALIGN_4
// ---- end of one 4-column panel of N; then dispatch the N%4 remainder ----
.L4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK // right-side TRMM: advance KK by the panel width
#endif
decq J // j --
jg .L4_01 // next 4 lines of N
/*******************************************************************************************/
.L2_0:
movq Nmod6, J
andq $3, J // j = N % 4 ; nothing left -> done
je .L999
movq Nmod6, J
andq $2, J // test bit 1 of N % 4 : two columns remain?
je .L1_0
// ---- pack the next 2 columns of B into the sub-buffer (2 floats per k) ----
.L2_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $2, %rax // K / 4
jz .L2_01b
ALIGN_4
.L2_01a:
// copy 4 k-iterations (8 floats) per pass
vmovsd (BO1), %xmm0
vmovsd 2*SIZE(BO1), %xmm1
vmovsd 4*SIZE(BO1), %xmm2
vmovsd 6*SIZE(BO1), %xmm3
vmovsd %xmm0, (BO)
vmovsd %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovsd %xmm3, 6*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO
decq %rax
jnz .L2_01a
.L2_01b:
movq K, %rax
andq $3, %rax // K % 4
jz .L2_02d
ALIGN_4
.L2_02c:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO
decq %rax
jnz .L2_02c
.L2_02d:
movq BO1, B // next offset of B
// ---- 16x2 micro-kernel over M for the 2-column remainder of N ----
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $4, I // i = (m >> 4)
je .L2_20
ALIGN_4
.L2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L2_16
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_12:
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
je .L2_16
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
je .L2_16
jmp .L2_12
ALIGN_4
.L2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L2_19
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_17:
KERNEL16x2_SUB
jl .L2_17
ALIGN_4
.L2_19:
SAVE16x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK
#endif
addq $16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
// Test rest of M
testq $15, M
jz .L2_60 // to next 2 lines of N
testq $8, M
jz .L2_21pre
ALIGN_4
/**************************************************************************/
// ---- 8x2 micro-kernel (M has the 8 bit set) ----
.L2_20_1:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_20_6
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_20_2:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
je .L2_20_6
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
je .L2_20_6
jmp .L2_20_2
ALIGN_4
.L2_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L2_20_9
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_20_7:
KERNEL8x2_SUB
jl .L2_20_7
ALIGN_4
.L2_20_9:
SAVE8x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************/
// ---- 4x2 micro-kernel (M has the 4 bit set) ----
.L2_21pre:
testq $4, M
jz .L2_30
ALIGN_4
.L2_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_26
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_26
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
je .L2_26
jmp .L2_22
ALIGN_4
.L2_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L2_29
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_27:
KERNEL4x2_SUB
jl .L2_27
ALIGN_4
.L2_29:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
// ---- 2x2 micro-kernel (M has the 2 bit set) ----
.L2_30:
testq $2, M
jz .L2_40
ALIGN_4
.L2_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax
je .L2_36
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_32:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_36
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_36
jmp .L2_32
ALIGN_4
.L2_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remaining tail iterations
je .L2_39
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_37:
KERNEL2x2_SUB
jl .L2_37
ALIGN_4
.L2_39:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
// ---------------------------------------------------------------------
// N-remainder==2 path, final M-tile of 1 row: 1x2 micro-tile.
// Same structure as the 2x2 tile above, but only 1 A-value per k
// iteration, so AO advances by rax (not rax*2).
// NOTE(review): KERNEL1x2_SUB / SAVE1x2 are macros defined outside this
// view; the je/jl exits rely on flags they set internally.
// ---------------------------------------------------------------------
.L2_40:
testq $1, M // is there a single leftover row?
jz .L2_60 // to next 2 lines of N
ALIGN_4
.L2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; 2 B-values per k iteration
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO // 1 A-value per k iteration
#endif
vzeroall // clear accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // unrolled portion: K - (K % 8)
je .L2_46
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (AO, %rax, SIZE), AO // advance past the region, walk back negatively
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_42:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_46 // counter hit zero inside the last macro -> done
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_46
jmp .L2_42
ALIGN_4
.L2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L2_49
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_47:
KERNEL1x2_SUB
jl .L2_47 // loop until the negated counter reaches zero
ALIGN_4
.L2_49:
SAVE1x2 // scale by alpha and store the 1x2 tile to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
ALIGN_4
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK // RIGHT TRMM: 2 N-columns consumed -> advance diagonal
#endif
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0 : handle the final single column of N (N odd).
* First copies one column of B into BUFFER1, then iterates M tiles of
* 16/8/4/2/1 rows against that single column.
*************************************************************************************************/
movq Nmod6, J
andq $1, J // j % 2 : is there one leftover column?
je .L999 // no -> restore registers and return
ALIGN_4
.L1_01:
// copy to sub buffer: one scalar float per k iteration
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax // rax = K elements to copy
ALIGN_4
.L1_02b:
vmovss (BO1), %xmm0 // load one float from B
vmovss %xmm0, (BO) // store it into BUFFER1
addq $1*SIZE,BO1
addq $1*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1 // CO1 = start of the current C column
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax // LEFT TRMM: reset diagonal offset for a new column
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO // bias; kernel macros presumably use -16*SIZE displacements -- TODO confirm
movq M, I
sarq $4, I // i = (m >> 4) : number of full 16-row tiles
je .L1_20 // none -> go straight to the M remainder
ALIGN_4
// ---------------------------------------------------------------------
// N==1 column, full 16-row tiles: 16x1 micro-tile loop.
// One B-value per k iteration (BI = rax, no scaling); 16 A-values per
// k iteration (rax <<= 4).
// NOTE(review): KERNEL16x1_SUB / SAVE16x1 are macros defined outside
// this view; the je/jl exits consume flags they set internally.
// ---------------------------------------------------------------------
.L1_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO (1 B-value per k)
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall // clear all ymm accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $16, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // K = K - ( K % 8 )
je .L1_16
movq %rax, BI // Index for BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO // advance past the unrolled region, then
leaq (BO, BI, SIZE), BO // walk back with negative indices
negq BI
negq %rax
ALIGN_4
.L1_12:
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
je .L1_16 // counter reached zero inside the last macro -> done
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
je .L1_16
jmp .L1_12
ALIGN_4
.L1_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L1_19
movq %rax, BI // Index for BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_17:
KERNEL16x1_SUB
jl .L1_17 // loop until the negated counter reaches zero
ALIGN_4
.L1_19:
SAVE16x1 // scale by alpha and store the 16x1 tile to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $4, %rax // rax = rax * 16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $16, KK // LEFT TRMM: 16 M-rows consumed
#endif
addq $16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L1_11 // next 16-row tile
ALIGN_4
/**************************************************************************
* Rest of M for the single-column (N==1) case: tiles of 8, 4, 2, 1 rows.
* The 8x1 tile follows below; 1 B-value and 8 A-values per k iteration.
* NOTE(review): KERNEL8x1_SUB / SAVE8x1 are macros defined outside this
* view; the je/jl exits consume flags they set internally.
***************************************************************************/
.L1_20:
// Test rest of M
testq $15, M // any rows left after the 16-row tiles?
jz .L999 // no remainder -> epilogue
testq $8, M
jz .L1_21pre // no 8-row tile -> try 4-row tile
ALIGN_4
/**************************************************************************/
.L1_20_1:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO (1 B-value per k)
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall // clear accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax // number of values in A
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // unrolled portion: K - (K % 8)
je .L1_20_6
movq %rax, BI // Index for BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO // advance past, then walk back negatively
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_20_2:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
je .L1_20_6 // counter hit zero inside the last macro -> done
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
je .L1_20_6
jmp .L1_20_2
ALIGN_4
.L1_20_6:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L1_20_9
movq %rax, BI // Index for BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_20_7:
KERNEL8x1_SUB
jl .L1_20_7 // loop until the negated counter reaches zero
ALIGN_4
.L1_20_9:
SAVE8x1 // scale by alpha and store the 8x1 tile to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK // LEFT TRMM: 8 M-rows consumed
#endif
addq $8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* N==1 column, 4-row tile: 4x1 micro-tile.
* 1 B-value and 4 A-values per k iteration (rax <<= 2).
* NOTE(review): KERNEL4x1_SUB / SAVE4x1 are macros defined outside this
* view; the je/jl exits consume flags they set internally.
**************************************************************************/
.L1_21pre:
testq $4, M // is there a 4-row tile?
jz .L1_30 // no -> try 2-row tile
ALIGN_4
.L1_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO (1 B-value per k)
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall // clear accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in A
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // unrolled portion: K - (K % 8)
je .L1_26
movq %rax, BI // Index for BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO // advance past, then walk back negatively
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_26 // counter hit zero inside the last macro -> done
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_26
jmp .L1_22
ALIGN_4
.L1_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L1_29
movq %rax, BI // Index for BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_27:
KERNEL4x1_SUB
jl .L1_27 // loop until the negated counter reaches zero
ALIGN_4
.L1_29:
SAVE4x1 // scale by alpha and store the 4x1 tile to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK // LEFT TRMM: 4 M-rows consumed
#endif
addq $4 * SIZE, CO1 # coffset += 4
ALIGN_4
// ---------------------------------------------------------------------
// N==1 column, 2-row tile: 2x1 micro-tile.
// 1 B-value and 2 A-values per k iteration (rax <<= 1).
// NOTE(review): KERNEL2x1_SUB / SAVE2x1 are macros defined outside this
// view; the je/jl exits consume flags they set internally.
// ---------------------------------------------------------------------
.L1_30:
testq $2, M // is there a 2-row tile?
jz .L1_40 // no -> try the final single row
ALIGN_4
.L1_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO (1 B-value per k)
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall // clear accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // unrolled portion: K - (K % 8)
je .L1_36
movq %rax, BI // Index for BO
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO // advance past, then walk back negatively
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_32:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_36 // counter hit zero inside the last macro -> done
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_36
jmp .L1_32
ALIGN_4
.L1_36:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L1_39
movq %rax, BI // Index for BO
salq $1, %rax // rax = rax *2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_37:
KERNEL2x1_SUB
jl .L1_37 // loop until the negated counter reaches zero
ALIGN_4
.L1_39:
SAVE2x1 // scale by alpha and store the 2x1 tile to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
salq $1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK // LEFT TRMM: 2 M-rows consumed
#endif
addq $2 * SIZE, CO1 # coffset += 2
ALIGN_4
// ---------------------------------------------------------------------
// N==1 column, final single row: 1x1 micro-tile (scalar dot product).
// 1 B-value and 1 A-value per k iteration, so both AO and BO advance
// by rax directly (no scaling).
// NOTE(review): KERNEL1x1_SUB / SAVE1x1 are macros defined outside this
// view; the je/jl exits consume flags they set internally.
// ---------------------------------------------------------------------
.L1_40:
testq $1, M // is there a leftover single row?
jz .L999 // no -> epilogue
ALIGN_4
.L1_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
#else
movq KK, %rax // TRMM: start at diagonal offset KK
leaq BUFFER1, BO // first buffer to BO
addq $4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
vzeroall // clear accumulators
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $-8, %rax // unrolled portion: K - (K % 8)
je .L1_46
movq %rax, BI // Index for BO
leaq (AO, %rax, SIZE), AO // advance past, then walk back negatively
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_42:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_46 // counter hit zero inside the last macro -> done
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_46
jmp .L1_42
ALIGN_4
.L1_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $7, %rax # rax = k % 8 ; remainder iterations
je .L1_49
movq %rax, BI // Index for BO
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_47:
KERNEL1x1_SUB
jl .L1_47 // loop until the negated counter reaches zero
ALIGN_4
.L1_49:
SAVE1x1 // scale by alpha and store the single element to C
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq (BO, BI, SIZE), BO
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
addq $1 * SIZE, CO1 # coffset += 1
ALIGN_4
// ---------------------------------------------------------------------
// Function epilogue: restore callee-saved state saved by the prologue
// (outside this view) and return. SP presumably holds the stack pointer
// captured at entry; offsets 0..40 match the order the prologue saved
// rbx/rbp/r12-r15 -- confirm against the prologue.
// On Windows x64 the ABI additionally makes rdi, rsi and xmm6-xmm15
// callee-saved, hence the extra restores under WINDOWS_ABI.
// ---------------------------------------------------------------------
.L999:
movq SP, %rsp // restore the stack pointer saved at entry
movq (%rsp), %rbx // restore callee-saved GPRs
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi // rdi/rsi are callee-saved on Windows x64
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 // xmm6-xmm15 are nonvolatile on Windows x64
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp // release the frame reserved by the prologue
ret
EPILOGUE
#endif // closes a conditional opened earlier in the file (outside this view)