diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
index e43dad4e7..f8a316b64 100644
--- a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
+++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
@@ -340,7 +340,7 @@
 
 	vmovsd	%xmm0, ALPHA
 
-	salq	$BASE_SHIFT, LDC
+	salq	$BASE_SHIFT, LDC		# LDC << 3  ( LDC * 8 )
 
 #ifdef TRMMKERNEL
 	vmovsd	%xmm12, OFFSET
@@ -350,7 +350,7 @@
 #endif
 #endif
 
 	movq	N, J
-	sarq	$2, J				# j = (n >> 2)
+	sarq	$2, J				# j = (n >> 2)  ( j = n / 4 )
 	jle	.L40
 	ALIGN_4
@@ -434,104 +434,6 @@
 #define PR2 24
 
 .L12:
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
-	prefetcht0	PR1*SIZE(AO,%rax,4)
-	prefetcht0	PR2*SIZE(AO,%rax,4)
-	prefetcht0	PR1*SIZE(BO,%rax,4)
-	prefetcht0	PR2*SIZE(BO,%rax,4)
-	KERNEL1(16 * 0)
-	KERNEL2(16 * 0)
-	KERNEL3(16 * 0)
-	KERNEL4(16 * 0)
-	KERNEL5(16 * 0)
-	KERNEL6(16 * 0)
-	KERNEL7(16 * 0)
-	KERNEL8(16 * 0)
-	NOBRANCH
-	je	.L15
 	prefetcht0	PR1*SIZE(AO,%rax,4)
 	prefetcht0	PR2*SIZE(AO,%rax,4)
 	prefetcht0	PR1*SIZE(BO,%rax,4)
@@ -986,15 +888,15 @@
 	jg	.L01
 	ALIGN_4
 
-.L40:
-	testq	$3, N
-	je	.L999
+.L40:					# handle remaining N % 4 columns
+	testq	$3, N			# N % 4
+	je	.L999			# Jump to end if N % 4 == 0
 
-	testq	$2, N
+	testq	$2, N			# N % 4 >= 2
 	je	.L80
 	ALIGN_4
 
-.L41:
+.L41:					# N % 4 > 1
 #if defined(TRMMKERNEL) && defined(LEFT)
 	movq	OFFSET, %rax
 	movq	%rax, KK
@@ -1002,14 +904,14 @@
 
 	movq	C, CO1			# coffset1 = c
 	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
-	movq	A, AO			# aoffset = a
+	movq	A, AO			# aoffset = a
 
 	movq	K, %rax
-	salq	$BASE_SHIFT + 1, %rax
+	salq	$BASE_SHIFT + 1, %rax	# k << 4
 	leaq	(B, %rax), BB
 
 	movq	M, I
-	sarq	$2, I			# i = (m >> 2)
+	sarq	$2, I			# i = (m >> 2)
 	jle	.L60
 	ALIGN_4
 
@@ -1063,12 +965,12 @@
 	je	.L56
 	ALIGN_4
 
-.L52:
+.L52:					# Loop for (N % 4) == 2
 	vfmaddpd	%xmm8,%xmm1, %xmm0,%xmm8
 	vfmaddpd	%xmm9,%xmm5, %xmm2,%xmm9
-	vmovups	-14 * SIZE(AO, %rax, 4),%xmm2
+	vmovups	-14 * SIZE(AO, %rax, 4),%xmm2
 	vfmaddpd	%xmm12,%xmm2, %xmm1,%xmm12
-	vmovups	-12 * SIZE(AO, %rax, 4), %xmm0
+	vmovups	-12 * SIZE(AO, %rax, 4), %xmm0
 	vmovddup	-14 * SIZE(BO, %rax, 2), %xmm1
 	vfmaddpd	%xmm13,%xmm2, %xmm5,%xmm13
 	vmovddup	-13 * SIZE(BO, %rax, 2), %xmm5
@@ -1076,15 +978,15 @@
 	vfmaddpd	%xmm8,%xmm1, %xmm0,%xmm8
 	vfmaddpd	%xmm12,%xmm2, %xmm1,%xmm12
 	vfmaddpd	%xmm9,%xmm5, %xmm0,%xmm9
-	vmovups	(AO, %rax, 4), %xmm0
-	vmovddup	-8 * SIZE(BO, %rax, 2), %xmm1
+	vmovups	(AO, %rax, 4), %xmm0
+	vmovddup	-8 * SIZE(BO, %rax, 2), %xmm1
 	vfmaddpd	%xmm13,%xmm2, %xmm5,%xmm13
 	vmovddup	-11 * SIZE(BO, %rax, 2), %xmm5
-	vmovups	-6 * SIZE(AO, %rax, 4), %xmm2
+	vmovups	-6 * SIZE(AO, %rax, 4), %xmm2
 	vfmaddpd	%xmm8,%xmm3, %xmm4,%xmm8
 	vfmaddpd	%xmm12,%xmm2, %xmm3,%xmm12
 	vfmaddpd	%xmm9,%xmm5, %xmm4,%xmm9
-	vmovups	-4 * SIZE(AO, %rax, 4), %xmm4
+	vmovups	-4 * SIZE(AO, %rax, 4), %xmm4
 	vmovddup	-10 * SIZE(BO, %rax, 2), %xmm3
 	vfmaddpd	%xmm13,%xmm2, %xmm5,%xmm13
 	vmovddup	-9 * SIZE(BO, %rax, 2), %xmm5
@@ -1093,7 +995,7 @@
 	vfmaddpd	%xmm12,%xmm2, %xmm3,%xmm12
 	vfmaddpd	%xmm9,%xmm5, %xmm4,%xmm9
 	vfmaddpd	%xmm13,%xmm2, %xmm5,%xmm13
-	vmovups	8 * SIZE(AO, %rax, 4), %xmm4
+	vmovups	8 * SIZE(AO, %rax, 4), %xmm4
 	vmovddup	-4 * SIZE(BO, %rax, 2), %xmm3
 	vmovddup	-7 * SIZE(BO, %rax, 2), %xmm5
 	vmovaps	%xmm0, %xmm2
@@ -1455,8 +1357,8 @@
 	ALIGN_4
 
 .L80:
-	testq	$1, N
-	je	.L999
+	testq	$1, N			# N % 4 == 1
+	je	.L999			# Jump to end if N is even
 	ALIGN_4
 
 .L81:
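
For orientation, a minimal C sketch of the column blocking that the new comments describe, assuming BASE_SHIFT == 3 for doubles; the function and parameter names below are illustrative only and do not appear in the kernel.

    /* Sketch of the loop structure referenced by the comments above. */
    void dgemm_column_blocks(long n, long ldc)
    {
        long ldc_bytes = ldc << 3;            /* salq $BASE_SHIFT, LDC: LDC * 8 bytes */
        (void)ldc_bytes;

        for (long j = n >> 2; j > 0; j--) {   /* j = n / 4 four-column panels (.L01) */
            /* 4x4 micro-kernel (KERNEL1..KERNEL8, .L12 loop) */
        }
        if ((n & 3) == 0) return;             /* .L40: done if N % 4 == 0 */
        if (n & 2) {                          /* .L41: two remaining columns */
            /* 4x2 micro-kernel (.L52 loop) */
        }
        if (n & 1) {                          /* .L80/.L81: one remaining column */
            /* 4x1 micro-kernel */
        }
    }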