From 05bb391c3aa8d642de45fb5a4ee0d5fae974061d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 16 Dec 2013 20:31:17 +0800 Subject: [PATCH] Refs #330. Fixed the compatible issue with clang on Mac OSX. --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 420 ++++++++++---------- kernel/x86_64/dgemm_kernel_16x2_haswell.S | 46 +-- kernel/x86_64/dgemm_kernel_4x4_haswell.S | 333 ++++++++-------- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 42 +- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 70 ++-- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 326 +++++++-------- 6 files changed, 618 insertions(+), 619 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index e4aba23e4..38c864ce5 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -110,22 +110,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif @@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) - addq $4 , BI - addq $16, %rax + addq $ 4 , BI + addq $ 16, %rax .endm .macro SAVE8x2 @@ -224,10 +224,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes - vshufps $0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $0xb1, %ymm11, %ymm11, %ymm11 - vshufps $0xb1, %ymm13, %ymm13, %ymm13 - vshufps $0xb1, %ymm15, %ymm15, %ymm15 + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -237,10 +237,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 - vshufps $0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $0xb1, %ymm10, %ymm10, %ymm11 - vshufps $0xb1, %ymm12, %ymm12, %ymm13 - vshufps $0xb1, %ymm14, %ymm14, %ymm15 + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 #else vaddsubps %ymm8, %ymm9 ,%ymm9 @@ -254,10 +254,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %ymm15, %ymm14 // swap high and low 64 bytes - vshufps $0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $0xb1, %ymm11, %ymm11, %ymm11 - vshufps $0xb1, %ymm13, %ymm13, %ymm13 - vshufps $0xb1, %ymm15, %ymm15, %ymm15 + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #endif @@ -318,8 +318,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $4, BI - addq $8, %rax + addq $ 4, BI + addq $ 8, %rax .endm .macro SAVE4x2 @@ -328,10 +328,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -341,10 +341,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -358,10 +358,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm15, %xmm14 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #endif @@ -412,8 +412,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $4, BI - addq $4, %rax + addq $ 4, BI + addq $ 4, %rax .endm .macro SAVE2x2 @@ -422,8 +422,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -431,8 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -442,8 +442,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm11, %xmm10 // swap high and low 4 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif @@ -484,8 +484,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $4, BI - addq $2, %rax + addq $ 4, BI + addq $ 2, %rax .endm .macro SAVE1x2 @@ -494,8 +494,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -503,8 +503,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -514,8 +514,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm11, %xmm10 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif @@ -556,8 +556,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) - addq $2 , BI - addq $16, %rax + addq $ 2 , BI + addq $ 16, %rax .endm .macro SAVE8x1 @@ -566,8 +566,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes - vshufps $0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -575,8 +575,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 - vshufps $0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 #else vaddsubps %ymm8, %ymm9 ,%ymm9 @@ -586,8 +586,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %ymm13, %ymm12 // swap high and low 64 bytes - vshufps $0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #endif @@ -628,8 +628,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $2, BI - addq $8, %rax + addq $ 2, BI + addq $ 8, %rax .endm .macro SAVE4x1 @@ -638,8 +638,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -647,8 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -658,8 +658,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm13, %xmm12 // swap high and low 4 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #endif @@ -694,8 +694,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $2, BI - addq $4, %rax + addq $ 2, BI + addq $ 4, %rax .endm .macro SAVE2x1 @@ -704,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -719,7 +719,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm9, %xmm8 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #endif @@ -749,8 +749,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $2, BI - addq $2, %rax + addq $ 2, BI + addq $ 2, %rax .endm .macro SAVE1x1 @@ -759,14 +759,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -774,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm9, %xmm8 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #endif @@ -805,7 +805,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE - subq $STACKSIZE, %rsp + subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -850,18 +850,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack STACK_TOUCH - cmpq $0, OLD_M + cmpq $ 0, OLD_M je .L999 - cmpq $0, OLD_N + cmpq $ 0, OLD_N je .L999 - cmpq $0, OLD_K + cmpq $ 0, OLD_K je .L999 movq OLD_M, M @@ -871,11 +871,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I - salq $ZBASE_SHIFT, LDC + salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx - movq $2, %rdi + movq $ 2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 @@ -893,7 +893,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_0: movq Ndiv6, J - cmpq $0, J + cmpq $ 0, J je .L1_0 ALIGN_4 @@ -910,8 +910,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups (BO1), %xmm0 vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO decq %rax jnz .L2_02b @@ -929,10 +929,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq A, AO // aoffset = a - addq $16 * SIZE, AO + addq $ 16 * SIZE, AO movq M, I - sarq $3, I // i = (m >> 3) + sarq $ 3, I // i = (m >> 3) je .L2_4_10 ALIGN_4 @@ -944,15 +944,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -967,20 +967,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $8, %rax // number of values in AO + addq $ 8, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_8_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1043,13 +1043,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_8_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1076,16 +1076,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK + addq $ 8, KK #endif - addq $16 * SIZE, CO1 # coffset += 16 + addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_8_11 ALIGN_4 @@ -1097,10 +1097,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_4_10: - testq $7, M + testq $ 7, M jz .L2_4_60 // to next 2 lines of N - testq $4, M + testq $ 4, M jz .L2_4_20 ALIGN_4 @@ -1111,15 +1111,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1134,20 +1134,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $4, %rax // number of values in AO + addq $ 4, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1202,13 +1202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1234,16 +1234,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK + addq $ 4, KK #endif - addq $8 * SIZE, CO1 # coffset += 8 + addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 @@ -1254,7 +1254,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_4_20: - testq $2, M + testq $ 2, M jz .L2_4_40 ALIGN_4 @@ -1264,15 +1264,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1287,20 +1287,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $2, %rax // number of values in AO + addq $ 2, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1351,13 +1351,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_4_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1378,8 +1378,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -1387,8 +1387,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 @@ -1398,8 +1398,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovaps %xmm11, %xmm10 // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif @@ -1437,16 +1437,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK + addq $ 2, KK #endif - addq $4 * SIZE, CO1 # coffset += 4 + addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_4_21 ALIGN_4 @@ -1455,7 +1455,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /**************************************************************************/ .L2_4_40: - testq $1, M + testq $ 1, M jz .L2_4_60 // to next 2 lines of N ALIGN_4 @@ -1466,15 +1466,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1489,20 +1489,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $1, %rax // number of values in AO + addq $ 1, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1551,13 +1551,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_4_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1583,16 +1583,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK + addq $ 1, KK #endif - addq $2 * SIZE, CO1 # coffset += 2 + addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_4_41 ALIGN_4 @@ -1602,7 +1602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_4_60: #if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK + addq $ 2, KK #endif decq J // j -- @@ -1617,7 +1617,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************************************/ movq Nmod6, J - andq $1, J // j % 2 + andq $ 1, J // j % 2 je .L999 ALIGN_4 @@ -1632,8 +1632,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO decq %rax jnz .L1_02b @@ -1651,10 +1651,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq A, AO // aoffset = a - addq $16 * SIZE, AO + addq $ 16 * SIZE, AO movq M, I - sarq $3, I // i = (m >> 3) + sarq $ 3, I // i = (m >> 3) je .L1_4_10 ALIGN_4 @@ -1667,15 +1667,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1690,20 +1690,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $8, %rax // number of values in AO + addq $ 8, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_8_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1764,13 +1764,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_8_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1797,16 +1797,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax *16 ; number of values + salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK + addq $ 8, KK #endif - addq $16 * SIZE, CO1 # coffset += 16 + addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_8_11 ALIGN_4 @@ -1816,10 +1816,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /**************************************************************************************************/ .L1_4_10: - testq $7, M + testq $ 7, M jz .L999 - testq $4, M + testq $ 4, M jz .L1_4_20 @@ -1829,15 +1829,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1852,20 +1852,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $4, %rax // number of values in AO + addq $ 4, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1918,13 +1918,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1950,16 +1950,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK + addq $ 4, KK #endif - addq $8 * SIZE, CO1 # coffset += 8 + addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 @@ -1970,7 +1970,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_4_20: - testq $2, M + testq $ 2, M jz .L1_4_40 ALIGN_4 @@ -1980,15 +1980,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -2003,20 +2003,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $2, %rax // number of values in AO + addq $ 2, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -2065,13 +2065,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_4_29 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -2097,23 +2097,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK + addq $ 2, KK #endif - addq $4 * SIZE, CO1 # coffset += 4 + addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_4_40: - testq $1, M + testq $ 1, M jz .L999 // to next 2 lines of N ALIGN_4 @@ -2124,15 +2124,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -2147,20 +2147,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $1, %rax // number of values in AO + addq $ 1, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -2207,13 +2207,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_4_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -2241,16 +2241,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK + addq $ 1, KK #endif - addq $2 * SIZE, CO1 # coffset += 2 + addq $ 2 * SIZE, CO1 # coffset += 2 ALIGN_4 @@ -2278,7 +2278,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movups 208(%rsp), %xmm15 #endif - addq $STACKSIZE, %rsp + addq $ STACKSIZE, %rsp ret EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 2907a6871..98b582c0d 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -192,8 +192,8 @@ VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3*SIZE , BO - addq $16*SIZE, AO + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO .endm @@ -212,8 +212,8 @@ VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 prefetcht0 B_PR1(BO) - addq $3*SIZE , BO - addq $8*SIZE, AO + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO .endm .macro KERNEL4x3_SUBN @@ -224,8 +224,8 @@ VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -10 * SIZE(BO), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3*SIZE , BO - addq $4*SIZE, AO + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO .endm .macro KERNEL2x3_SUBN @@ -240,8 +240,8 @@ VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3*SIZE , BO - addq $2*SIZE, AO + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO .endm .macro KERNEL1x3_SUBN @@ -252,8 +252,8 @@ VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -10 * SIZE(BO), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3*SIZE , BO - addq $1*SIZE, AO + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO .endm @@ -1602,16 +1602,16 @@ vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $4, BI - addq $4, %rax + addq $ 4, BI + addq $ 4, %rax .endm .macro KERNEL1x1_SUB vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $1, BI - addq $1 , %rax + addq $ 1, BI + addq $ 1 , %rax .endm .macro SAVE1x1 @@ -1749,9 +1749,9 @@ vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO vmovups 0 * SIZE(BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm2 @@ -1769,9 +1769,9 @@ vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO decq %rax jnz .L6_01a_1 @@ -1792,9 +1792,9 @@ vmovsd 0 * SIZE(BO2), %xmm2 vmovups %xmm0, 0*SIZE(BO) vmovsd %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO decq %rax jnz .L6_02b diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index 1bfb71572..d165d5da0 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -107,22 +107,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif @@ -168,17 +168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 - addq $12*SIZE, BO + addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -197,16 +197,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -221,24 +221,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $8*SIZE, AO + addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups 4 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 vmovups 8 * SIZE(BO), %ymm3 - addq $24*SIZE, BO + addq $ 24*SIZE, BO .endm @@ -247,21 +247,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $8*SIZE, AO + addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $12*SIZE, BO + addq $ 12*SIZE, BO .endm .macro KERNEL4x12_SUB @@ -272,17 +272,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $12*SIZE, BO + addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $4*SIZE, AO + addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -309,23 +309,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $0xb1 , %ymm5, %ymm5 - vpermpd $0xb1 , %ymm7, %ymm7 + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 - vblendpd $0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $0x05, %ymm5, %ymm4, %ymm1 - vblendpd $0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $0x05, %ymm7, %ymm6, %ymm3 + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $0x1b , %ymm2, %ymm2 - vpermpd $0x1b , %ymm3, %ymm3 - vpermpd $0xb1 , %ymm2, %ymm2 - vpermpd $0xb1 , %ymm3, %ymm3 + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 - vblendpd $0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax @@ -349,23 +349,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $0xb1 , %ymm9 , %ymm9 - vpermpd $0xb1 , %ymm11, %ymm11 + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 - vblendpd $0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $0x05, %ymm11, %ymm10, %ymm3 + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $0x1b , %ymm2, %ymm2 - vpermpd $0x1b , %ymm3, %ymm3 - vpermpd $0xb1 , %ymm2, %ymm2 - vpermpd $0xb1 , %ymm3, %ymm3 + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 - vblendpd $0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 2), %rax @@ -390,23 +390,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $0xb1 , %ymm13, %ymm13 - vpermpd $0xb1 , %ymm15, %ymm15 + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 - vblendpd $0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $0x05, %ymm13, %ymm12, %ymm1 - vblendpd $0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $0x05, %ymm15, %ymm14, %ymm3 + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $0x1b , %ymm2, %ymm2 - vpermpd $0x1b , %ymm3, %ymm3 - vpermpd $0xb1 , %ymm2, %ymm2 - vpermpd $0xb1 , %ymm3, %ymm3 + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 - vblendpd $0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 4), %rax @@ -431,7 +431,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - addq $4*SIZE, CO1 + addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ @@ -477,9 +477,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %xmm0 ,%xmm3 , %xmm12 vmovddup -1 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $12*SIZE, BO + addq $ 12*SIZE, BO vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $2*SIZE, AO + addq $ 2*SIZE, AO vfmadd231pd %xmm0 ,%xmm3 , %xmm15 .endm @@ -557,7 +557,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) - addq $2*SIZE, CO1 + addq $ 2*SIZE, CO1 .endm @@ -604,9 +604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231sd %xmm0 ,%xmm3 , %xmm12 vmovsd -1 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $12*SIZE, BO + addq $ 12*SIZE, BO vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $1*SIZE, AO + addq $ 1*SIZE, AO vfmadd231sd %xmm0 ,%xmm3 , %xmm15 .endm @@ -684,7 +684,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) - addq $1*SIZE, CO1 + addq $ 1*SIZE, CO1 .endm @@ -707,13 +707,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 - addq $4*SIZE, BO - vpermpd $0xb1, %ymm0 , %ymm0 + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -723,12 +723,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -737,44 +737,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $8*SIZE, AO - vpermpd $0xb1, %ymm0 , %ymm0 + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 - addq $8*SIZE, BO + addq $ 8*SIZE, BO .endm .macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $0x1b, %ymm0 , %ymm0 + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $8*SIZE, AO - vpermpd $0xb1, %ymm0 , %ymm0 + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $4*SIZE, BO + addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $0xb1, %ymm0 , %ymm0 + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $4*SIZE, BO - vpermpd $0x1b, %ymm0 , %ymm0 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $4*SIZE, AO - vpermpd $0xb1, %ymm0 , %ymm0 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -788,23 +788,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $0xb1 , %ymm5, %ymm5 - vpermpd $0xb1 , %ymm7, %ymm7 + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 - vblendpd $0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $0x05, %ymm5, %ymm4, %ymm1 - vblendpd $0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $0x05, %ymm7, %ymm6, %ymm3 + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $0x1b , %ymm2, %ymm2 - vpermpd $0x1b , %ymm3, %ymm3 - vpermpd $0xb1 , %ymm2, %ymm2 - vpermpd $0xb1 , %ymm3, %ymm3 + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 - vblendpd $0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax @@ -823,7 +823,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - addq $4*SIZE, CO1 + addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ @@ -848,9 +848,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -9 * SIZE(BO), %xmm8 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $4*SIZE, BO + addq $ 4*SIZE, BO vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $2*SIZE, AO + addq $ 2*SIZE, AO .endm @@ -880,7 +880,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) - addq $2*SIZE, CO1 + addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ @@ -905,9 +905,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -9 * SIZE(BO), %xmm8 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $4*SIZE, BO + addq $ 4*SIZE, BO vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $1*SIZE, AO + addq $ 1*SIZE, AO .endm @@ -937,7 +937,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) - addq $1*SIZE, CO1 + addq $ 1*SIZE, CO1 .endm @@ -963,8 +963,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %xmm1 ,%xmm2 , %xmm5 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $2*SIZE, BO - addq $4*SIZE, AO + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO .endm @@ -993,7 +993,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm6 , (CO1, LDC) vmovups %xmm7 , 2 * SIZE(CO1, LDC) - addq $4*SIZE, CO1 + addq $ 4*SIZE, CO1 .endm @@ -1014,8 +1014,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup -11 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $2*SIZE, BO - addq $2*SIZE, AO + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO .endm @@ -1038,7 +1038,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) - addq $2*SIZE, CO1 + addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ @@ -1058,8 +1058,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd -11 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $2*SIZE, BO - addq $1*SIZE, AO + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO .endm @@ -1082,7 +1082,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) - addq $1*SIZE, CO1 + addq $ 1*SIZE, CO1 .endm @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -14 * SIZE(AO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - addq $1*SIZE, BO - addq $4*SIZE, AO + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO .endm @@ -1127,7 +1127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm4 , (CO1) vmovups %xmm5 , 2 * SIZE(CO1) - addq $4*SIZE, CO1 + addq $ 4*SIZE, CO1 .endm @@ -1145,8 +1145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $1*SIZE, BO - addq $2*SIZE, AO + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO .endm @@ -1166,7 +1166,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm4 , (CO1) - addq $2*SIZE, CO1 + addq $ 2*SIZE, CO1 .endm @@ -1184,8 +1184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $1*SIZE, BO - addq $1*SIZE, AO + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO .endm @@ -1205,7 +1205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm4 , (CO1) - addq $1*SIZE, CO1 + addq $ 1*SIZE, CO1 .endm @@ -1262,13 +1262,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. STACK_TOUCH - cmpq $0, OLD_M + cmpq $ 0, OLD_M je .L999 - cmpq $0, OLD_N + cmpq $ 0, OLD_N je .L999 - cmpq $0, OLD_K + cmpq $ 0, OLD_K je .L999 movq OLD_M, M @@ -1288,7 +1288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq Ndiv12, J - cmpq $0, J + cmpq $ 0, J je .L4_0 ALIGN_4 @@ -1330,10 +1330,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6, 16 * SIZE(BO) vmovups %ymm7, 20 * SIZE(BO) - addq $8 * SIZE ,BO1 - addq $8 * SIZE ,BO2 - addq $8 * SIZE ,BO3 - addq $24 *SIZE ,BO + addq $ 8 * SIZE ,BO1 + addq $ 8 * SIZE ,BO2 + addq $ 8 * SIZE ,BO3 + addq $ 24 *SIZE ,BO decq %rax jnz .L12_01a_1 @@ -1356,10 +1356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm1, 0 * SIZE(BO) vmovups %ymm2, 4 * SIZE(BO) vmovups %ymm3, 8 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO2 - addq $4*SIZE,BO3 - addq $12*SIZE,BO + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 4*SIZE,BO3 + addq $ 12*SIZE,BO decq %rax jnz .L12_02b @@ -1407,8 +1407,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subq $2, %rax je .L12_12a - .align 32 - + ALIGN_5 .L12_12: KERNEL4x12_M1 @@ -1621,7 +1620,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L4_0: - cmpq $0, Nmod12 // N % 12 == 0 + cmpq $ 0, Nmod12 // N % 12 == 0 je .L999 movq Nmod12, J @@ -1666,7 +1665,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subq $2, %rax je .L4_12a - .align 32 + ALIGN_5 .L4_12: @@ -1912,7 +1911,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - .align 32 + ALIGN_5 .L2_12: @@ -2108,7 +2107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sarq $3, %rax // K / 8 je .L1_16 - .align 32 + ALIGN_5 .L1_12: @@ -2362,13 +2361,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. STACK_TOUCH - cmpq $0, OLD_M + cmpq $ 0, OLD_M je .L999 - cmpq $0, OLD_N + cmpq $ 0, OLD_N je .L999 - cmpq $0, OLD_K + cmpq $ 0, OLD_K je .L999 movq OLD_M, M @@ -2397,7 +2396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq Ndiv12, J - cmpq $0, J + cmpq $ 0, J je .L2_0 ALIGN_4 @@ -2471,7 +2470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subq $2, %rax je .L4_12a - .align 32 + ALIGN_5 .L4_12: @@ -2848,7 +2847,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - .align 32 + ALIGN_5 .L2_12: @@ -3176,7 +3175,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sarq $3, %rax // K / 8 je .L1_16 - .align 32 + ALIGN_5 .L1_12: diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index 8585d45de..e09e3b3f5 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %xmm0 vmulpd %xmm1,%xmm0,%xmm10 vmulpd %xmm2,%xmm0,%xmm11 - addq $3*SIZE, BO + addq $ 3 * SIZE, BO vmulpd %xmm3,%xmm0,%xmm12 vmovups -10 * SIZE(AO), %xmm0 vmulpd %xmm1,%xmm0,%xmm13 @@ -294,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups 14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -3 * SIZE(BO), %xmm1 - addq $32 * SIZE, AO + addq $ 32 * SIZE, AO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -2 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) @@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup 10 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) vmovddup 11 * SIZE(BO), %xmm3 - addq $32 * SIZE, AO - addq $24 * SIZE, BO + addq $ 32 * SIZE, AO + addq $ 24 * SIZE, BO .endm @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $32*SIZE, AO + addq $ 32 * SIZE, AO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $21*SIZE, BO + addq $ 21 * SIZE, BO VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -438,9 +438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $3*SIZE, BO + addq $ 3 * SIZE, BO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $8*SIZE, AO + addq $ 8 * SIZE, AO VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -483,7 +483,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 C_PR1(CO1,LDC) prefetcht0 C_PR1(CO1,LDC,2) - addq $8 * SIZE, CO1 # coffset += 8 + addq $ 8 * SIZE, CO1 # coffset += 8 .endm @@ -1165,9 +1165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO decq %rax jnz .L6_02 @@ -1184,9 +1184,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd (BO2), %xmm1 vmovups %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO decq %rax jnz .L6_02b @@ -1223,9 +1223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm4, 7*SIZE(BO) vmovsd %xmm7, 9*SIZE(BO) vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO decq %rax jnz .L6_03 @@ -1243,9 +1243,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups (BO2), %xmm1 vmovsd %xmm0, (BO) vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO decq %rax jnz .L6_03b diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 2f1434ffa..6c3cda022 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $4 , BI - addq $16, %rax + addq $ 4 , BI + addq $ 16, %rax .endm .macro SAVE16x4 @@ -233,8 +233,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $4 , BI - addq $8 , %rax + addq $ 4 , BI + addq $ 8 , %rax .endm .macro SAVE8x4 @@ -277,8 +277,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $4 , BI - addq $4 , %rax + addq $ 4 , BI + addq $ 4 , %rax .endm .macro SAVE4x4 @@ -325,8 +325,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $4 , BI - addq $2, %rax + addq $ 4 , BI + addq $ 2, %rax .endm .macro SAVE2x4 @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $4 , BI - addq $1, %rax + addq $ 4 , BI + addq $ 1, %rax .endm .macro SAVE1x4 @@ -432,8 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $2 , BI - addq $16, %rax + addq $ 2 , BI + addq $ 16, %rax .endm .macro SAVE16x2 @@ -474,8 +474,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $2 , BI - addq $8 , %rax + addq $ 2 , BI + addq $ 8 , %rax .endm .macro SAVE8x2 @@ -507,8 +507,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $2 , BI - addq $4 , %rax + addq $ 2 , BI + addq $ 4 , %rax .endm .macro SAVE4x2 @@ -542,8 +542,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $2 , BI - addq $2, %rax + addq $ 2 , BI + addq $ 2, %rax .endm .macro SAVE2x2 @@ -583,8 +583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - addq $2 , BI - addq $1, %rax + addq $ 2 , BI + addq $ 1, %rax .endm .macro SAVE1x2 @@ -619,8 +619,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $1 , BI - addq $16, %rax + addq $ 1 , BI + addq $ 16, %rax .endm .macro SAVE16x1 @@ -649,8 +649,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $1 , BI - addq $8 , %rax + addq $ 1 , BI + addq $ 8 , %rax .endm .macro SAVE8x1 @@ -677,8 +677,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $1 , BI - addq $4 , %rax + addq $ 1 , BI + addq $ 4 , %rax .endm .macro SAVE4x1 @@ -706,8 +706,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $1 , BI - addq $2, %rax + addq $ 1 , BI + addq $ 2 , %rax .endm .macro SAVE2x1 @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $1 , BI - addq $1, %rax + addq $ 1 , BI + addq $ 1 , %rax .endm .macro SAVE1x1 @@ -882,8 +882,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm2, 8*SIZE(BO) vmovups %xmm3,12*SIZE(BO) - addq $16*SIZE,BO1 - addq $16*SIZE,BO + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO decq %rax jnz .L4_01a @@ -899,8 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups (BO1), %xmm0 vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO decq %rax jnz .L4_02c @@ -919,7 +919,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq A, AO // aoffset = a - addq $16 * SIZE, AO + addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index 1e6278466..515939df6 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -109,22 +109,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); + movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif @@ -212,8 +212,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - addq $4, BI - addq $8, %rax + addq $ 4, BI + addq $ 8, %rax .endm .macro SAVE4x2 @@ -222,10 +222,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastsd ALPHA_I, %ymm1 // swap high and low 8 bytes - vshufpd $0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $0x05, %ymm11, %ymm11, %ymm11 - vshufpd $0x05, %ymm13, %ymm13, %ymm13 - vshufpd $0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -235,10 +235,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubpd %ymm13,%ymm12, %ymm12 vaddsubpd %ymm15,%ymm14, %ymm14 - vshufpd $0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $0x05, %ymm10, %ymm10, %ymm11 - vshufpd $0x05, %ymm12, %ymm12, %ymm13 - vshufpd $0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 #else vaddsubpd %ymm8, %ymm9 ,%ymm9 @@ -252,10 +252,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %ymm15, %ymm14 // swap high and low 8 bytes - vshufpd $0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $0x05, %ymm11, %ymm11, %ymm11 - vshufpd $0x05, %ymm13, %ymm13, %ymm13 - vshufpd $0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 #endif @@ -316,8 +316,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $4, BI - addq $4, %rax + addq $ 4, BI + addq $ 4, %rax .endm .macro SAVE2x2 @@ -326,10 +326,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -339,10 +339,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %xmm15, %xmm14 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 #endif @@ -415,8 +415,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $4, BI - addq $2, %rax + addq $ 4, BI + addq $ 2, %rax .endm .macro SAVE1x2 @@ -425,8 +425,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 #else vaddsubpd %xmm8, %xmm9, %xmm9 @@ -445,8 +445,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %xmm11, %xmm10 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 #endif @@ -486,8 +486,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - addq $2, BI - addq $8, %rax + addq $ 2, BI + addq $ 8, %rax .endm .macro SAVE4x1 @@ -496,8 +496,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastsd ALPHA_I, %ymm1 // swap high and low 8 bytes - vshufpd $0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -505,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm13,%ymm12 , %ymm12 - vshufpd $0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 #else vaddsubpd %ymm8, %ymm9 , %ymm9 @@ -516,8 +516,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %ymm13, %ymm12 // swap high and low 8 bytes - vshufpd $0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 #endif @@ -559,8 +559,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $2, BI - addq $4, %rax + addq $ 2, BI + addq $ 4, %rax .endm .macro SAVE2x1 @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) @@ -578,8 +578,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13,%xmm12 , %xmm12 - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 #else vaddsubpd %xmm8, %xmm9 , %xmm9 @@ -589,8 +589,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %xmm13, %xmm12 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 #endif @@ -626,8 +626,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $2, BI - addq $2, %rax + addq $ 2, BI + addq $ 2, %rax .endm .macro SAVE1x1 @@ -636,14 +636,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8, %xmm8 - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 #else vaddsubpd %xmm8, %xmm9, %xmm9 @@ -651,7 +651,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovapd %xmm9, %xmm8 // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 #endif @@ -682,7 +682,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE - subq $STACKSIZE, %rsp + subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -727,18 +727,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack STACK_TOUCH - cmpq $0, OLD_M + cmpq $ 0, OLD_M je .L999 - cmpq $0, OLD_N + cmpq $ 0, OLD_N je .L999 - cmpq $0, OLD_K + cmpq $ 0, OLD_K je .L999 movq OLD_M, M @@ -748,11 +748,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I - salq $ZBASE_SHIFT, LDC + salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx - movq $2, %rdi + movq $ 2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 @@ -770,7 +770,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_00_0: movq Ndiv6, J - cmpq $0, J + cmpq $ 0, J je .L1_2_0 ALIGN_4 @@ -789,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO decq %rax jnz .L2_00_02b @@ -809,10 +809,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq A, AO // aoffset = a - addq $8 * SIZE, AO + addq $ 8 * SIZE, AO movq M, I - sarq $2, I // i = (m >> 2) + sarq $ 2, I // i = (m >> 2) je .L2_2_10 ALIGN_4 @@ -825,15 +825,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -848,20 +848,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $4, %rax // number of values in AO + addq $ 4, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -928,13 +928,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -960,16 +960,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK + addq $ 4, KK #endif - addq $8 * SIZE, CO1 # coffset += 8 + addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_4_11 ALIGN_4 @@ -982,7 +982,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************************************************/ .L2_2_10: - testq $2, M + testq $ 2, M jz .L2_2_40 // to next 2 lines of N .L2_2_11: @@ -991,15 +991,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1014,20 +1014,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $2, %rax // number of values in AO + addq $ 2, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1086,13 +1086,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1118,16 +1118,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK + addq $ 2, KK #endif - addq $4 * SIZE, CO1 # coffset += 4 + addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 @@ -1135,7 +1135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Rest of M ***************************************************************************/ .L2_2_40: - testq $1, M + testq $ 1, M jz .L2_2_60 // to next 2 lines of N ALIGN_4 @@ -1146,15 +1146,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO + addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1169,20 +1169,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $1, %rax // number of values in AO + addq $ 1, %rax // number of values in AO #else - addq $2, %rax // number of values in BO + addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1237,13 +1237,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L2_2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1269,16 +1269,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK + addq $ 1, KK #endif - addq $2 * SIZE, CO1 # coffset += 2 + addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_2_41 ALIGN_4 @@ -1288,7 +1288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_2_60: #if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK + addq $ 2, KK #endif decq J // j -- @@ -1303,7 +1303,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************************************/ movq Nmod6, J - andq $1, J // j % 2 + andq $ 1, J // j % 2 je .L999 ALIGN_4 @@ -1318,8 +1318,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups (BO1), %xmm0 vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO decq %rax jnz .L1_00_02b @@ -1337,10 +1337,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movq A, AO // aoffset = a - addq $8 * SIZE, AO + addq $ 8 * SIZE, AO movq M, I - sarq $2, I // i = (m >> 2) + sarq $ 2, I // i = (m >> 2) je .L1_2_10 ALIGN_4 @@ -1354,15 +1354,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1377,20 +1377,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $4, %rax // number of values in AO + addq $ 4, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1433,13 +1433,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1466,16 +1466,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values + salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK + addq $ 4, KK #endif - addq $8 * SIZE, CO1 # coffset += 8 + addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_4_11 ALIGN_4 @@ -1485,7 +1485,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*******************************************************************************************************/ .L1_2_10: - testq $2, M + testq $ 2, M jz .L1_2_40 @@ -1495,15 +1495,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1518,20 +1518,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $2, %rax // number of values in AO + addq $ 2, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1583,13 +1583,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_2_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1615,16 +1615,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values + salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK + addq $ 2, KK #endif - addq $4 * SIZE, CO1 # coffset += 4 + addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 @@ -1633,7 +1633,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Rest of M ***************************************************************************/ .L1_2_40: - testq $1, M + testq $ 1, M jz .L999 ALIGN_4 @@ -1644,15 +1644,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO + addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif @@ -1667,20 +1667,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else movq KK, %rax #ifdef LEFT - addq $1, %rax // number of values in AO + addq $ 1, %rax // number of values in AO #else - addq $1, %rax // number of values in BO + addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif - andq $-8, %rax // K = K - ( K % 8 ) + andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1731,13 +1731,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq KKK, %rax #endif - andq $7, %rax # if (k & 1) + andq $ 7, %rax # if (k & 1) je .L1_2_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI @@ -1763,16 +1763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values + salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK + addq $ 1, KK #endif - addq $2 * SIZE, CO1 # coffset += 2 + addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L1_2_41 ALIGN_4 @@ -1806,7 +1806,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. movups 208(%rsp), %xmm15 #endif - addq $STACKSIZE, %rsp + addq $ STACKSIZE, %rsp ret EPILOGUE