From 1c65b0a686f4e9aa8ed80364bea380e824e98209 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 16 Jul 2019 22:13:26 +0800
Subject: [PATCH] Add files via upload

Replaced most "vpermpd" instructions with the cheaper "vpermilpd" or
"vperm2f128" to eliminate the performance penalty on Zen 2 chips.
---
 kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4795 ++++++++++++++++++++++
 1 file changed, 4795 insertions(+)
 create mode 100644 kernel/x86_64/dgemm_kernel_4x8_haswell.S

diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
new file mode 100644
index 000000000..d12d71bfd
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
@@ -0,0 +1,4795 @@
+/*********************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , 
%ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermilpd $5 ,%ymm5, %ymm5 //original code: vpermpd $ 0xb1, %ymm5 , %ymm5 + vpermilpd $5 ,%ymm7, %ymm7 //original code: vpermpd $ 0xb1, %ymm7 , %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + 
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermilpd $5 ,%ymm9 , %ymm9 //original code: vpermpd $ 0xb1, %ymm9 , %ymm9 + vpermilpd $5 ,%ymm11, %ymm11 //original code: vpermpd $ 0xb1, %ymm11 , %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermilpd $5 ,%ymm13, %ymm13 //original code: vpermpd $ 0xb1, %ymm13 , %ymm13 + vpermilpd $5 ,%ymm15, %ymm15 //original code: vpermpd $ 0xb1, %ymm15 , %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , 
%xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * 
SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * 
SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vpermilpd $5 , %ymm5, %ymm5 //original code: vpermpd $ 0xb1, %ymm5 , %ymm5 + vpermilpd $5 , %ymm7, %ymm7 //original code: vpermpd $ 0xb1, %ymm7 , %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermilpd $5 , %ymm9 , %ymm9 //original code: vpermpd $ 0xb1, %ymm9 , %ymm9 + vpermilpd $5 , %ymm11, %ymm11 //original code: vpermpd $ 0xb1, %ymm11 , %ymm11 + + 
vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 
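+        // the eight B values are cycled through %xmm1-%xmm3 below, so each scalar
+        // load can overlap the FMA that consumes the previously loaded value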
+ vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO 
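+        // E variant: closes the unrolled loop, so no B vector is preloaded here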
+.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermilpd $5 ,%ymm0 , %ymm0 //original code: vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermilpd $5 ,%ymm5, %ymm5 //original code: vpermpd $ 0xb1, %ymm5 , %ymm5 + vpermilpd $5 ,%ymm7, %ymm7 //original code: vpermpd $ 0xb1, %ymm7 , %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + +/* original code: + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 +*/ + +//new_code + vperm2f128 $1, %ymm2, %ymm2 , %ymm2 + vperm2f128 $1, %ymm3, %ymm3 , %ymm3 +//end_of_new_code + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + 
vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + 
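+/* Note on the shuffle replacements used in the 4x12, 4x8 and 4x4 macros above
+   (a summary of the change described in the commit message):
+   "vpermpd $0xb1, %ymmX, %ymmX" swaps the two doubles inside each 128-bit lane;
+   "vpermilpd $5, %ymmX, %ymmX" gives the same result without a cross-lane
+   shuffle, which the commit reports as costly on Zen 2.
+   The "vpermpd $0x1b" + "vpermpd $0xb1" pair of the original SAVE macros is
+   equivalent to exchanging the two 128-bit halves of the register, which
+   "vperm2f128 $1, %ymmX, %ymmX, %ymmX" does in a single instruction.
+   The remaining "vpermpd $0x1b" (full 4-element reversal) has no in-lane
+   equivalent and is left unchanged in the compute kernels. */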
+/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + 
movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + 
KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec 
%rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + 
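+// 4x4 tiles: unroll K by 8 with the KERNEL4x4_I/_M1/_M2/_E macros, then handle
+// the K%8 remainder with KERNEL4x4_SUB before SAVE4x4 writes the block back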
+.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 
16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 
144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + 
KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + 
KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + 
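+// M-remainder tails of this N=4 TRMM block: an (M & 2) path (2x4, .L4_30)
+// and an (M & 1) path (1x4, .L4_40), each built from the plain *_SUB macros
+// and followed by the TRMM BO/AO advance and KK update for that tile size.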
+.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + 
leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, 
%rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: 
+ + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif
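For reference, a minimal scalar C sketch (not part of the patch) of the arithmetic one 4x8 micro-tile of the plain (non-TRMM) path performs, i.e. C(4x8) += alpha * A(4xK) * B(Kx8). The panel layout assumed here (A stored as K groups of 4 values, B as K groups of 8 values, C column-major with leading dimension ldc) and the helper name dgemm_tile_4x8_ref are illustrative assumptions, not the exact OpenBLAS packing or kernel interface.

#include <stddef.h>

/* Scalar reference for one 4x8 micro-tile: C(4x8) += alpha * A(4xK) * B(Kx8).
 * Assumed (hypothetical) packing: Apanel holds K groups of 4 doubles,
 * Bpanel holds K groups of 8 doubles, C is column-major with leading
 * dimension ldc -- mirroring the 4x8 register blocking of the kernel above. */
static void dgemm_tile_4x8_ref(size_t K, double alpha,
                               const double *Apanel,
                               const double *Bpanel,
                               double *C, size_t ldc)
{
    for (size_t j = 0; j < 8; ++j) {
        for (size_t i = 0; i < 4; ++i) {
            double acc = 0.0;
            for (size_t p = 0; p < K; ++p)          /* inner K loop */
                acc += Apanel[p * 4 + i] * Bpanel[p * 8 + j];
            C[i + j * ldc] += alpha * acc;          /* GEMM update  */
        }
    }
}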