From 52e8b3824b76f5c06d6dd2c99b79faab8262cf8d Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 16 Jul 2019 22:10:27 +0800
Subject: [PATCH] Delete dgemm_kernel_4x8_haswell.S

---
 kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4753 ----------------------
 1 file changed, 4753 deletions(-)
 delete mode 100644 kernel/x86_64/dgemm_kernel_4x8_haswell.S

diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
deleted file mode 100644
index c84b599ce..000000000
--- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S
+++ /dev/null
@@ -1,4753 +0,0 @@
-/*********************************************************************************
-Copyright (c) 2015, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 
, %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, 
LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , 
%xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - 
-#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ - - -.macro INIT4x8 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - -.endm - -.macro KERNEL4x8_I - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -4 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 0 * SIZE(BO), %ymm2 - addq $ 16*SIZE, BO -.endm - - -.macro KERNEL4x8_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - addq $ 8*SIZE, BO -.endm - -.macro KERNEL4x8_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 
- vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 8*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - -.endm - - -.macro SAVE4x8 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL2x8_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - 
addq $ 8*SIZE, BO - addq $ 2*SIZE, AO - -.endm - -.macro SAVE2x8 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL1x8_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 1*SIZE, AO - -.endm - -.macro SAVE1x8 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - - -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO - 
vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - 
vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - 
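
Every KERNELmxn_SUB / SAVEmxn macro pair deleted above follows one pattern: the SUB macro accumulates a single rank-1 update of an m-by-n tile of C into vector registers while advancing the packed A (AO) and B (BO) pointers, and the SAVE macro scales the accumulators by alpha and, in the non-TRMM build, adds them into C in memory. Below is a minimal C sketch of the 2x2 case just closed above (KERNEL2x2_SUB plus SAVE2x2); the function name and signature are illustrative, not OpenBLAS API.

/*
 * Sketch of KERNEL2x2_SUB / SAVE2x2: C(2x2) += alpha * A(2xk) * B(kx2),
 * with A and B already packed so each k-step reads 2 contiguous doubles.
 * dgemm_tile_2x2 is a hypothetical name, not an OpenBLAS symbol.
 */
#include <stddef.h>

static void dgemm_tile_2x2(size_t k, double alpha,
                           const double *AO, const double *BO,
                           double *C, size_t ldc)
{
    double c00 = 0.0, c10 = 0.0;   /* accumulator xmm4: column 0 of the tile */
    double c01 = 0.0, c11 = 0.0;   /* accumulator xmm6: column 1 of the tile */

    for (size_t l = 0; l < k; l++) {
        double a0 = AO[0], a1 = AO[1]; /* vmovups  (AO), %xmm0           */
        double b0 = BO[0], b1 = BO[1]; /* vmovddup (BO), %xmm2 / %xmm3   */
        c00 += a0 * b0;                /* vfmadd231pd %xmm0,%xmm2,%xmm4  */
        c10 += a1 * b0;
        c01 += a0 * b1;                /* vfmadd231pd %xmm0,%xmm3,%xmm6  */
        c11 += a1 * b1;
        AO += 2;                       /* addq $ 2*SIZE, AO              */
        BO += 2;                       /* addq $ 2*SIZE, BO              */
    }

    /* SAVE2x2, non-TRMM path: scale by alpha and add into C.
       The TRMM build stores alpha * acc without the add. */
    C[0]       += alpha * c00;
    C[1]       += alpha * c10;
    C[ldc]     += alpha * c01;
    C[ldc + 1] += alpha * c11;
}

The wider 4x8 and 4x12 kernels keep this structure but avoid per-element broadcasts of A: they reshuffle the whole A vector with vpermpd between FMAs ($ 0xb1 swaps adjacent pairs, $ 0x1b reverses the four doubles) so that every a_i meets every b_j, and the vpermpd/vblendpd sequences in SAVE4x8 / SAVE4x12 undo that register-level permutation before the results are written to C.
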
-/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - 
-#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $24, %rdi - divq %rdi // N / 24 - movq %rax, Ndiv12 // N / 24 - movq %rdx, Nmod12 // N % 24 - - - movq Ndiv12, J - cmpq $ 0, J - je .L8_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values from BO1 - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - movq BO2 , B - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm2 - vmovups 0 * SIZE(BO2), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq 
$2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - - -/**************************************************************************************************/ - -.L13_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values - movq B, BO2 - leaq (B,%rax, SIZE), BO3 // next offset to BO2 - leaq (BO3,%rax, SIZE), B // next offset to B - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - - -.L13_02b: - - vmovups 4 * SIZE(BO2), %ymm1 - vmovups 0 * SIZE(BO3), %ymm2 - vmovups 4 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO2 - addq $ 8*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L13_02b - - - -.L13_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L13_20 - - ALIGN_4 - -.L13_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L13_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L13_12a - - ALIGN_5 -.L13_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L13_12 - -.L13_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_13: - - test $1, %rax - jz .L13_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_14: - - INIT4x12 - - -.L13_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_19 - - ALIGN_4 - -.L13_17: - - KERNEL4x12_SUB - - dec %rax - jne .L13_17 - ALIGN_4 - - -.L13_19: - - SAVE4x12 - - decq I # i -- - jne .L13_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L13_20: - // Test rest of M - - testq $3, M - jz .L13_100 // to next 16 lines of N - - -.L13_30: - testq $2, M - jz .L13_40 - - ALIGN_4 - -.L13_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, 
%rax - - sarq $3, %rax - je .L13_36 - ALIGN_4 - -.L13_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L13_32 - ALIGN_4 - -.L13_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_39 - - ALIGN_4 - -.L13_37: - - KERNEL2x12_SUB - - dec %rax - jne .L13_37 - ALIGN_4 - - -.L13_39: - - SAVE2x12 - - ALIGN_4 - -.L13_40: - testq $1, M - jz .L13_100 // to next 3 lines of N - - ALIGN_4 - -.L13_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L13_46 - - ALIGN_4 - -.L13_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L13_42 - ALIGN_4 - -.L13_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_49 - - ALIGN_4 - -.L13_47: - - KERNEL1x12_SUB - - dec %rax - jne .L13_47 - ALIGN_4 - - -.L13_49: - - SAVE1x12 - - ALIGN_4 - -.L13_100: - - decq J // j -- - jg .L12_01 - - - - -/**************************************************************************************************/ - -.L8_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $3, J // j = j / 8 - je .L4_0 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x8 - - movq K, %rax - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x8 - - movq K, %rax - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - 
KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - - ALIGN_4 - -.L8_100: - - movq K, %rax - salq $3, %rax // * 8 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L8_10 - - - -/**************************************************************************************************/ - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - testq $4, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO 
- - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * 
SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $8, %rdi - divq %rdi // N / 8 - movq %rax, Ndiv12 // N / 8 - movq %rdx, Nmod12 // N % 8 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -/*************************************************************************************************/ -.L8_0: - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 8 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - 
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$4, %rax	// number of values in AO
-#else
-	addq	$8, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	sarq	$3, %rax	// K / 8
-	cmpq	$2, %rax
-	jl	.L8_13
-
-
-	KERNEL4x8_I
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	subq	$2, %rax
-	je	.L8_12a
-
-	ALIGN_5
-
-.L8_12:
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	dec	%rax
-	jne	.L8_12
-
-.L8_12a:
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_E
-
-	jmp	.L8_16
-
-
-.L8_13:
-
-	test	$1, %rax
-	jz	.L8_14
-
-	KERNEL4x8_I
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-
-	KERNEL4x8_M1
-	KERNEL4x8_M2
-	KERNEL4x8_M1
-	KERNEL4x8_E
-
-	jmp	.L8_16
-
-
-.L8_14:
-
-	INIT4x8
-
-
-.L8_16:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L8_19
-
-	ALIGN_4
-
-.L8_17:
-
-	KERNEL4x8_SUB
-
-	dec	%rax
-	jne	.L8_17
-	ALIGN_4
-
-
-.L8_19:
-
-	SAVE4x8
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 8), BO	// number of values in B
-	leaq	(AO, %rax, 4), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$4, KK			// number of values in A
-#endif
-
-	decq	I			# i --
-	jg	.L8_11
-	ALIGN_4
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L8_20:
-	// Test rest of M
-
-	testq	$3, M
-	jz	.L8_100		// to next 16 lines of N
-
-
-.L8_30:
-	testq	$2, M
-	jz	.L8_40
-
-	ALIGN_4
-
-.L8_31:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,8), BO		// add number of values in B
-	leaq	(AO,%rax,2), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$2, %rax	// number of values in AO
-#else
-	addq	$8, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT2x8
-
-	sarq	$3, %rax
-	je	.L8_36
-	ALIGN_4
-
-.L8_32:
-
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-	KERNEL2x8_SUB
-
-	dec	%rax
-	jne	.L8_32
-	ALIGN_4
-
-.L8_36:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L8_39
-
-	ALIGN_4
-
-.L8_37:
-
-	KERNEL2x8_SUB
-
-	dec	%rax
-	jne	.L8_37
-
-
-.L8_39:
-
-	SAVE2x8
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 8), BO	// number of values in B
-	leaq	(AO, %rax, 2), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$2, KK			// number of values in A
-#endif
-
-
-.L8_40:
-	testq	$1, M
-	jz	.L8_100		// to next 3 lines of N
-
-	ALIGN_4
-
-.L8_41:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,8), BO		// add number of values in B
-	leaq	(AO,%rax,1), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$1, %rax	// number of values in AO
-#else
-	addq	$8, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT1x8
-
-	sarq	$3,%rax
-	je	.L8_46
-
-	ALIGN_4
-
-.L8_42:
-
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-	KERNEL1x8_SUB
-
-	dec	%rax
-	jne	.L8_42
-	ALIGN_4
-
-.L8_46:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L8_49
-
-	ALIGN_4
-
-.L8_47:
-
-	KERNEL1x8_SUB
-
-	dec	%rax
-	jne	.L8_47
-	ALIGN_4
-
-
-.L8_49:
-
-	SAVE1x8
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 8), BO	// number of values in B
-	leaq	(AO, %rax, 1), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$1, KK			// number of values in A
-#endif
-
-.L8_100:
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-	addq	$8, KK			// number of values in B
-#endif
-
-
-	decq	J			// j --
-	jg	.L8_10
-
-
-
-
-
-/*************************************************************************************************/
-.L4_0:
-	movq	Nmod12, J
-	testq	$4, J
-	je	.L2_0
-	ALIGN_4
-
-.L4_10:
-	movq	C, CO1
-	leaq	(C, LDC, 4), C		// c += 4 * ldc
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	movq	OFFSET, %rax
-	movq	%rax, KK
-#endif
-
-
-	movq	A, AO		// aoffset = a
-	addq	$16 * SIZE, AO
-
-	movq	M, I
-	sarq	$2, I		// i = m / 4
-	je	.L4_20
-
-	ALIGN_4
-
-.L4_11:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,4), BO		// add number of values in B
-	leaq	(AO,%rax,4), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$4, %rax	// number of values in AO
-#else
-	addq	$4, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	sarq	$3, %rax	// K / 8
-	cmpq	$2, %rax
-	jl	.L4_13
-
-
-	KERNEL4x4_I
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	subq	$2, %rax
-	je	.L4_12a
-
-	ALIGN_5
-
-.L4_12:
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	dec	%rax
-	jne	.L4_12
-
-.L4_12a:
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_E
-
-	jmp	.L4_16
-
-
-.L4_13:
-
-	test	$1, %rax
-	jz	.L4_14
-
-	KERNEL4x4_I
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-
-	KERNEL4x4_M1
-	KERNEL4x4_M2
-	KERNEL4x4_M1
-	KERNEL4x4_E
-
-	jmp	.L4_16
-
-
-.L4_14:
-
-	INIT4x4
-
-
-.L4_16:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L4_19
-
-	ALIGN_4
-
-.L4_17:
-
-	KERNEL4x4_SUB
-
-	dec	%rax
-	jne	.L4_17
-	ALIGN_4
-
-
-.L4_19:
-
-	SAVE4x4
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 4), BO	// number of values in B
-	leaq	(AO, %rax, 4), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$4, KK			// number of values in A
-#endif
-
-	decq	I			# i --
-	jg	.L4_11
-	ALIGN_4
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L4_20:
-	// Test rest of M
-
-	testq	$3, M
-	jz	.L4_100		// to next 16 lines of N
-
-
-.L4_30:
-	testq	$2, M
-	jz	.L4_40
-
-	ALIGN_4
-
-.L4_31:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,4), BO		// add number of values in B
-	leaq	(AO,%rax,2), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$2, %rax	// number of values in AO
-#else
-	addq	$4, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT2x4
-
-	sarq	$3, %rax
-	je	.L4_36
-	ALIGN_4
-
-.L4_32:
-
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-	KERNEL2x4_SUB
-
-	dec	%rax
-	jne	.L4_32
-	ALIGN_4
-
-.L4_36:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L4_39
-
-	ALIGN_4
-
-.L4_37:
-
-	KERNEL2x4_SUB
-
-	dec	%rax
-	jne	.L4_37
-
-
-.L4_39:
-
-	SAVE2x4
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 4), BO	// number of values in B
-	leaq	(AO, %rax, 2), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$2, KK			// number of values in A
-#endif
-
-
-.L4_40:
-	testq	$1, M
-	jz	.L4_100		// to next 3 lines of N
-
-	ALIGN_4
-
-.L4_41:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,4), BO		// add number of values in B
-	leaq	(AO,%rax,1), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$1, %rax	// number of values in AO
-#else
-	addq	$4, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT1x4
-
-	sarq	$3,%rax
-	je	.L4_46
-
-	ALIGN_4
-
-.L4_42:
-
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-	KERNEL1x4_SUB
-
-	dec	%rax
-	jne	.L4_42
-	ALIGN_4
-
-.L4_46:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L4_49
-
-	ALIGN_4
-
-.L4_47:
-
-	KERNEL1x4_SUB
-
-	dec	%rax
-	jne	.L4_47
-	ALIGN_4
-
-
-.L4_49:
-
-	SAVE1x4
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 4), BO	// number of values in B
-	leaq	(AO, %rax, 1), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$1, KK			// number of values in A
-#endif
-
-.L4_100:
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-	addq	$4, KK			// number of values in B
-#endif
-
-
-	movq	K, %rax
-	salq	$2, %rax		// * 4
-	leaq	(B , %rax, SIZE), B
-
-
-
-
-/***************************************************************************************************************/
-
-.L2_0:
-
-	movq	Nmod12, J
-	testq	$2, J
-	je	.L1_0
-
-.L2_10:
-	movq	C, CO1
-	leaq	(C, LDC, 2), C		// c += 2 * ldc
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	movq	OFFSET, %rax
-	movq	%rax, KK
-#endif
-
-
-
-	movq	A, AO		// aoffset = a
-	addq	$16 * SIZE, AO
-
-	movq	M, I
-	sarq	$2, I		// i = m / 4
-	je	.L2_20
-
-	ALIGN_4
-
-.L2_11:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,2), BO		// add number of values in B
-	leaq	(AO,%rax,4), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$4, %rax	// number of values in AO
-#else
-	addq	$2, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT4x2
-
-	sarq	$3, %rax	// K / 8
-
-	je	.L2_16
-
-	ALIGN_5
-
-.L2_12:
-
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-	KERNEL4x2_SUB
-
-	dec	%rax
-	jne	.L2_12
-
-
-.L2_16:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L2_19
-
-	ALIGN_4
-
-.L2_17:
-
-	KERNEL4x2_SUB
-
-	dec	%rax
-	jne	.L2_17
-	ALIGN_4
-
-
-.L2_19:
-
-	SAVE4x2
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 2), BO	// number of values in B
-	leaq	(AO, %rax, 4), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$4, KK			// number of values in A
-#endif
-
-
-	decq	I			# i --
-	jg	.L2_11
-	ALIGN_4
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L2_20:
-	// Test rest of M
-
-	testq	$3, M
-	jz	.L2_100		// to next 16 lines of N
-
-
-.L2_30:
-	testq	$2, M
-	jz	.L2_40
-
-	ALIGN_4
-
-.L2_31:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,2), BO		// add number of values in B
-	leaq	(AO,%rax,2), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$2, %rax	// number of values in AO
-#else
-	addq	$2, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT2x2
-
-	sarq	$3, %rax
-	je	.L2_36
-	ALIGN_4
-
-.L2_32:
-
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-	KERNEL2x2_SUB
-
-	dec	%rax
-	jne	.L2_32
-
-.L2_36:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L2_39
-
-	ALIGN_4
-
-.L2_37:
-
-	KERNEL2x2_SUB
-
-	dec	%rax
-	jne	.L2_37
-
-
-.L2_39:
-
-	SAVE2x2
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax + SIZE
-	leaq	(BO, %rax, 2), BO	// number of values in B
-	leaq	(AO, %rax, 2), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$2, KK			// number of values in A
-#endif
-
-
-.L2_40:
-	testq	$1, M
-	jz	.L2_100		// to next 3 lines of N
-
-.L2_41:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,2), BO		// add number of values in B
-	leaq	(AO,%rax,1), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$1, %rax	// number of values in AO
-#else
-	addq	$2, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT1x2
-
-	sarq	$3,%rax
-	je	.L2_46
-
-	ALIGN_4
-
-.L2_42:
-
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-	KERNEL1x2_SUB
-
-	dec	%rax
-	jne	.L2_42
-
-.L2_46:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L2_49
-
-	ALIGN_4
-
-.L2_47:
-
-	KERNEL1x2_SUB
-
-	dec	%rax
-	jne	.L2_47
-
-.L2_49:
-
-	SAVE1x2
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO, %rax, 2), BO	// number of values in B
-	leaq	(AO, %rax, 1), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$1, KK			// number of values in A
-#endif
-
-
-.L2_100:
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-	addq	$2, KK			// number of values in B
-#endif
-
-	movq	K, %rax
-	salq	$1, %rax		// * 2
-	leaq	(B , %rax, SIZE), B
-
-/***************************************************************************************************************/
-
-.L1_0:
-
-	movq	Nmod12, J
-	testq	$1, J
-	je	.L999
-
-.L1_10:
-	movq	C, CO1
-	leaq	(C, LDC, 1), C		// c += 1 * ldc
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	movq	OFFSET, %rax
-	movq	%rax, KK
-#endif
-
-	movq	A, AO		// aoffset = a
-	addq	$16 * SIZE, AO
-
-	movq	M, I
-	sarq	$2, I		// i = m / 4
-	je	.L1_20
-
-	ALIGN_4
-
-.L1_11:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,1), BO		// add number of values in B
-	leaq	(AO,%rax,4), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$4, %rax	// number of values in AO
-#else
-	addq	$1, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT4x1
-
-	sarq	$3, %rax	// K / 8
-	je	.L1_16
-
-	ALIGN_5
-
-.L1_12:
-
-	KERNEL4x1
-
-	dec	%rax
-	jne	.L1_12
-
-
-.L1_16:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L1_19
-
-	ALIGN_4
-
-.L1_17:
-
-	KERNEL4x1_SUB
-
-	dec	%rax
-	jne	.L1_17
-	ALIGN_4
-
-
-.L1_19:
-
-	SAVE4x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO, %rax, 1), BO	// number of values in B
-	leaq	(AO, %rax, 4), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$4, KK			// number of values in A
-#endif
-
-
-	decq	I			# i --
-	jg	.L1_11
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L1_20:
-	// Test rest of M
-
-	testq	$3, M
-	jz	.L1_100
-
-
-.L1_30:
-	testq	$2, M
-	jz	.L1_40
-
-	ALIGN_4
-
-.L1_31:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,1), BO		// add number of values in B
-	leaq	(AO,%rax,2), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$2, %rax	// number of values in AO
-#else
-	addq	$1, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT2x1
-
-	sarq	$3, %rax
-	je	.L1_36
-	ALIGN_4
-
-.L1_32:
-
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-	KERNEL2x1_SUB
-
-
-	dec	%rax
-	jne	.L1_32
-
-.L1_36:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L1_39
-
-	ALIGN_4
-
-.L1_37:
-
-	KERNEL2x1_SUB
-
-	dec	%rax
-	jne	.L1_37
-
-.L1_39:
-
-	SAVE2x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO, %rax, 1), BO	// number of values in B
-	leaq	(AO, %rax, 2), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$2, KK			// number of values in A
-#endif
-
-
-.L1_40:
-	testq	$1, M
-	jz	.L1_100		// to next 3 lines of N
-
-
-.L1_41:
-
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	B, BO
-	addq	$12 * SIZE, BO
-#else
-	movq	B, BO
-	addq	$12 * SIZE, BO
-	movq	KK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO,%rax,1), BO		// add number of values in B
-	leaq	(AO,%rax,1), AO		// add number of values in A
-#endif
-
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$1, %rax	// number of values in AO
-#else
-	addq	$1, %rax	// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	INIT1x1
-
-	sarq	$3,%rax
-	je	.L1_46
-
-	ALIGN_4
-
-.L1_42:
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	dec	%rax
-	jne	.L1_42
-
-.L1_46:
-	movq	KKK, %rax
-
-	andq	$7, %rax		# if (k & 1)
-	je	.L1_49
-
-	ALIGN_4
-
-.L1_47:
-
-	KERNEL1x1_SUB
-
-	dec	%rax
-	jne	.L1_47
-
-
-.L1_49:
-
-	SAVE1x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	salq	$3, %rax		// rax * SIZE
-	leaq	(BO, %rax, 1), BO	// number of values in B
-	leaq	(AO, %rax, 1), AO	// number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$1, KK			// number of values in A
-#endif
-
-
-
-.L1_100:
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-	addq	$1, KK			// number of values in B
-#endif
-
-
-
-.L999:
-
-	vzeroupper
-
-	movq	SP, %rsp
-	movq	(%rsp), %rbx
-	movq	8(%rsp), %rbp
-	movq	16(%rsp), %r12
-	movq	24(%rsp), %r13
-	movq	32(%rsp), %r14
-	movq	40(%rsp), %r15
-
-#ifdef WINDOWS_ABI
-	movq	48(%rsp), %rdi
-	movq	56(%rsp), %rsi
-	vmovups	64(%rsp), %xmm6
-	vmovups	80(%rsp), %xmm7
-	vmovups	96(%rsp), %xmm8
-	vmovups	112(%rsp), %xmm9
-	vmovups	128(%rsp), %xmm10
-	vmovups	144(%rsp), %xmm11
-	vmovups	160(%rsp), %xmm12
-	vmovups	176(%rsp), %xmm13
-	vmovups	192(%rsp), %xmm14
-	vmovups	208(%rsp), %xmm15
-#endif
-
-	addq	$STACKSIZE, %rsp
-	ret
-
-	EPILOGUE
-
-
-
-
-
-#endif