Merge pull request #198 from wernsaar/develop

new optimization of dgemm kernel for bulldozer: 10% performance increase
Zhang Xianyi 2013-03-06 13:39:53 -08:00
commit 2c9a203bd1
1 changed file with 20 additions and 118 deletions
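The main hunk below reworks the unrolled .L12 loop, which prefetches the packed A and B panels with prefetcht0 at distances PR1/PR2 ahead of each group of eight KERNEL macros. As a rough illustration of the technique only, here is a minimal C sketch using the _mm_prefetch intrinsic; the function name dot_with_prefetch, the PF_DIST distance, and the scalar loop body are illustrative assumptions, not the kernel's actual code.

#include <xmmintrin.h>                 /* _mm_prefetch, _MM_HINT_T0 */

#define PF_DIST 16   /* hypothetical prefetch distance, in elements */

/* Toy inner product over two packed panels: prefetch a and b a fixed
   distance ahead so later iterations find their data already in cache. */
double dot_with_prefetch(const double *a, const double *b, long k)
{
    double sum = 0.0;
    for (long i = 0; i < k; i++) {
        _mm_prefetch((const char *)(a + i + PF_DIST), _MM_HINT_T0);
        _mm_prefetch((const char *)(b + i + PF_DIST), _MM_HINT_T0);
        sum += a[i] * b[i];            /* the real kernel runs 4x4 FMA blocks here */
    }
    return sum;
}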


@@ -340,7 +340,7 @@
vmovsd %xmm0, ALPHA
salq $BASE_SHIFT, LDC
salq $BASE_SHIFT, LDC # LDC << 3 # LDC * 8
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
@@ -350,7 +350,7 @@
#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
sarq $2, J # j = (n >> 2) # j = n / 4
jle .L40
ALIGN_4
@@ -434,104 +434,6 @@
#define PR2 24
.L12:
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
prefetcht0 PR2*SIZE(BO,%rax,4)
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
NOBRANCH
je .L15
prefetcht0 PR1*SIZE(AO,%rax,4)
prefetcht0 PR2*SIZE(AO,%rax,4)
prefetcht0 PR1*SIZE(BO,%rax,4)
@@ -986,15 +888,15 @@
jg .L01
ALIGN_4
.L40:
testq $3, N
je .L999
.L40: # N % 4
testq $3, N # test N & 3 (N % 4)
je .L999 # Jump to end if N % 4 == 0
testq $2, N
testq $2, N # test N & 2 (N % 4 >= 2)
je .L80
ALIGN_4
.L41:
.L41: # N % 4 > 1
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
@@ -1005,7 +907,7 @@
movq A, AO # aoffset = a
movq K, %rax
salq $BASE_SHIFT + 1, %rax
salq $BASE_SHIFT + 1, %rax # k << 4
leaq (B, %rax), BB
movq M, I
@@ -1063,7 +965,7 @@
je .L56
ALIGN_4
.L52:
.L52: # Loop for (N % 4) == 2
vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8
vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9
vmovups -14 * SIZE(AO, %rax, 4),%xmm2
@@ -1455,8 +1357,8 @@
ALIGN_4
.L80:
testq $1, N
je .L999
testq $1, N # test N & 1 (odd column left)
je .L999 # Jump to end if N is even
ALIGN_4
.L81:
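The comments added in the final hunks document the column-remainder handling: the main loop (J = N >> 2, jg .L01) consumes four columns of C per iteration, .L40 dispatches to the two-column tail at .L41 when the 2-bit of N is set, and .L80/.L81 handle a last single column when N is odd. A minimal C sketch of that dispatch follows; do_cols_4/2/1 are hypothetical stand-ins for the unrolled kernels, not functions in the source file.

/* Hypothetical stand-ins for the unrolled 4-, 2- and 1-column kernels. */
static void do_cols_4(void) { /* ... main 4x4 kernel ... */ }
static void do_cols_2(void) { /* ... two-column tail ...  */ }
static void do_cols_1(void) { /* ... one-column tail ...  */ }

/* Column dispatch mirroring the labels above (.L01 loop, .L40, .L41, .L80, .L81). */
void gemm_columns(long n)
{
    for (long j = n >> 2; j > 0; j--)   /* sarq $2, J ; main 4-column loop */
        do_cols_4();

    if ((n & 3) == 0) return;           /* .L40: testq $3, N ; je .L999 */
    if (n & 2)                          /*       testq $2, N ; je .L80  */
        do_cols_2();                    /* .L41: two remaining columns  */
    if (n & 1)                          /* .L80: testq $1, N ; je .L999 */
        do_cols_1();                    /* .L81: one remaining column   */
}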