Refs #173. Fixed internal buffer overflow bug of gemv_t on x86.
This commit is contained in:
parent
a4ee6f3915
commit
fd3046b32a
|
@ -89,17 +89,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 20 + STACKSIZE(%esp)
|
#define A 20 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 24 + STACKSIZE(%esp)
|
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 28 + STACKSIZE(%esp)
|
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 32 + STACKSIZE(%esp)
|
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 36 + STACKSIZE(%esp)
|
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 40 + STACKSIZE(%esp)
|
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 44 + STACKSIZE(%esp)
|
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+STACKSIZE(%esp)
|
||||||
|
#define NN 4+STACKSIZE(%esp)
|
||||||
|
#define AA 8+STACKSIZE(%esp)
|
||||||
|
#define LDAX 12+STACKSIZE(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -114,6 +120,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -122,6 +129,37 @@
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
|
movl LDA,LDAX # backup LDA
|
||||||
|
movl N,J
|
||||||
|
movl J,NN # backup N
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # mov M to MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $23,J # J=2^23
|
||||||
|
subl J,MMM # MMM=MMM-J
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
|
movl NN,%eax
|
||||||
|
movl %eax,N # reset N
|
||||||
|
|
||||||
|
|
||||||
|
movl LDAX, LDA # reset LDA
|
||||||
|
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
@ -628,10 +666,19 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -76,18 +76,24 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+STACKSIZE(%esp)
|
||||||
|
#define AA 4+STACKSIZE(%esp)
|
||||||
|
#define LDAX 8+STACKSIZE(%esp)
|
||||||
|
#define NN 12+STACKSIZE(%esp)
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
|
||||||
#define N 8 + STACKSIZE(%esp)
|
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
|
||||||
#define A 24 + STACKSIZE(%esp)
|
|
||||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
|
||||||
#define STACK_X 32 + STACKSIZE(%esp)
|
|
||||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
|
||||||
#define Y 40 + STACKSIZE(%esp)
|
|
||||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
|
||||||
#define BUFFER 48 + STACKSIZE(%esp)
|
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
|
||||||
|
@ -101,6 +107,8 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
|
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -108,7 +116,38 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
|
movl LDA,LDAX # backup LDA
|
||||||
|
movl N,J
|
||||||
|
movl J,NN # backup N
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # mov M to MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $22,J # J=2^22
|
||||||
|
subl J,MMM # MMM=MMM-J
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
|
movl NN,%eax
|
||||||
|
movl %eax,N # reset N
|
||||||
|
|
||||||
|
|
||||||
|
movl LDAX, LDA # reset LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
@ -117,6 +156,7 @@
|
||||||
leal (,INCY, SIZE), INCY
|
leal (,INCY, SIZE), INCY
|
||||||
leal (,LDA, SIZE), LDA
|
leal (,LDA, SIZE), LDA
|
||||||
|
|
||||||
|
|
||||||
subl $-16 * SIZE, A
|
subl $-16 * SIZE, A
|
||||||
|
|
||||||
cmpl $0, N
|
cmpl $0, N
|
||||||
|
@ -560,10 +600,19 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -47,7 +47,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_M %rdi
|
#define OLD_M %rdi
|
||||||
#define OLD_N %rsi
|
#define OLD_N %rsi
|
||||||
|
@ -57,6 +57,10 @@
|
||||||
#define STACK_Y 16 + STACKSIZE(%rsp)
|
#define STACK_Y 16 + STACKSIZE(%rsp)
|
||||||
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
|
#define MMM 56(%rsp)
|
||||||
|
#define NN 64(%rsp)
|
||||||
|
#define AA 72(%rsp)
|
||||||
|
#define LDAX 80(%rsp)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -71,6 +75,10 @@
|
||||||
#define STACK_Y 72 + STACKSIZE(%rsp)
|
#define STACK_Y 72 + STACKSIZE(%rsp)
|
||||||
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||||
|
#define MMM 216(%rsp)
|
||||||
|
#define NN 224(%rsp)
|
||||||
|
#define AA 232(%rsp)
|
||||||
|
#define LDAX 240(%rsp)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -127,29 +135,46 @@
|
||||||
movups %xmm14, 192(%rsp)
|
movups %xmm14, 192(%rsp)
|
||||||
movups %xmm15, 208(%rsp)
|
movups %xmm15, 208(%rsp)
|
||||||
|
|
||||||
movq OLD_M, M
|
movq OLD_M, MMM
|
||||||
movq OLD_N, N
|
movq OLD_N, NN
|
||||||
movq OLD_A, A
|
movq OLD_A, AA
|
||||||
movq OLD_LDA, LDA
|
movq OLD_LDA, LDAX
|
||||||
movq OLD_X, X
|
movq OLD_X, X
|
||||||
#else
|
#else
|
||||||
movq OLD_M, M
|
movq OLD_M, MMM
|
||||||
movq OLD_N, N
|
movq OLD_N, NN
|
||||||
movq OLD_A, A
|
movq OLD_A, AA
|
||||||
movq OLD_LDA, LDA
|
movq OLD_LDA, LDAX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
movq STACK_INCX, INCX
|
|
||||||
movq STACK_Y, Y
|
|
||||||
movq STACK_INCY, INCY
|
|
||||||
movq STACK_BUFFER, BUFFER
|
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
pshufd $0, %xmm0, ALPHA
|
pshufd $0, %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
pshufd $0, %xmm3, ALPHA
|
pshufd $0, %xmm3, ALPHA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq M,M
|
||||||
|
addq $1,M
|
||||||
|
salq $22,M
|
||||||
|
subq M,MMM
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movq MMM,%rax
|
||||||
|
addq M,%rax
|
||||||
|
jle .L999x
|
||||||
|
movq %rax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq LDAX,LDA
|
||||||
|
movq NN,N
|
||||||
|
movq AA,A
|
||||||
|
movq STACK_INCX, INCX
|
||||||
|
movq STACK_Y, Y
|
||||||
|
movq STACK_INCY, INCY
|
||||||
|
movq STACK_BUFFER, BUFFER
|
||||||
|
|
||||||
leaq (,INCX, SIZE), INCX
|
leaq (,INCX, SIZE), INCX
|
||||||
leaq (,INCY, SIZE), INCY
|
leaq (,INCY, SIZE), INCY
|
||||||
leaq (,LDA, SIZE), LDA
|
leaq (,LDA, SIZE), LDA
|
||||||
|
@ -6341,6 +6366,12 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
leaq (,M,SIZE),%rax
|
||||||
|
addq %rax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
Loading…
Reference in New Issue