Run the gemv_t kernels over M in bounded passes.

What the patch does (all three files follow the same shape):
  * extra stack slots MMM, NN, AA and LDAX keep the caller's M, N, A and LDA;
  * at .L0t the pass size is set to 2^23 (gemv_t_sse.S) or 2^22
    (gemv_t_sse2.S, sgemv_t.S) and subtracted from MMM; when fewer than
    that remain, M becomes the remainder, and when nothing remains control
    leaves through .L999x;
  * at .L00t N, A and LDA are reloaded from the saved copies and the
    existing kernel body runs unchanged;
  * at .L999 the saved A pointer is advanced by M*SIZE and control returns
    to .L0t.

Changes made while reviewing this patch:
  * sgemv_t.S, WINDOWS_ABI branch: the directives for MMM and NN were
    misspelled, so neither macro was defined; they are spelled correctly now.
  * gemv_t_sse.S: the comment beside "sall $23,J" said 2^22; it now says 2^23
    to match the instruction.
  * Line structure was rebuilt from a whitespace-collapsed copy. Original
    tab/space runs could not be recovered, so apply with whitespace-insensitive
    matching (for example: git apply --ignore-whitespace).

NOTE(review), not changed here, please confirm before merging:
  * sgemv_t.S, WINDOWS_ABI: MMM is 216(%rsp), but "movups %xmm15, 208(%rsp)"
    stores 16 bytes at 208..223 and MMM is written after that store, so the
    saved upper half of xmm15 looks clobbered.
  * sgemv_t.S, WINDOWS_ABI: if OLD_A and OLD_LDA are stack operands there,
    "movq OLD_A, AA" and "movq OLD_LDA, LDAX" are memory-to-memory moves,
    which x86 cannot encode; a scratch register would be needed.
  * All three files: AA advances by M*SIZE per pass, but X is taken again
    from STACK_X (x86) or left as loaded (x86_64) on every pass; verify
    whether the x pointer must advance together with A.

diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S
index 5bacb7da8..c72febe3d 100644
--- a/kernel/x86/gemv_t_sse.S
+++ b/kernel/x86/gemv_t_sse.S
@@ -89,17 +89,23 @@
 #endif
 
 #define STACKSIZE 16
+#define ARGS 16
 
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 20 + STACKSIZE(%esp)
-#define STACK_LDA 24 + STACKSIZE(%esp)
-#define STACK_X 28 + STACKSIZE(%esp)
-#define STACK_INCX 32 + STACKSIZE(%esp)
-#define Y 36 + STACKSIZE(%esp)
-#define STACK_INCY 40 + STACKSIZE(%esp)
-#define BUFFER 44 + STACKSIZE(%esp)
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 20 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
+#define STACK_X 28 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
+#define Y 36 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
+#define BUFFER 44 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define NN 4+STACKSIZE(%esp)
+#define AA 8+STACKSIZE(%esp)
+#define LDAX 12+STACKSIZE(%esp)
 
 #define I %eax
 #define J %ebx
@@ -114,6 +120,7 @@
 
 	PROLOGUE
 
+	subl $ARGS,%esp
 	pushl %ebp
 	pushl %edi
 	pushl %esi
@@ -122,6 +129,37 @@
 	PROFCODE
 
 	movl STACK_LDA, LDA
+	movl LDA,LDAX # backup LDA
+	movl N,J
+	movl J,NN # backup N
+	movl A,J
+	movl J,AA # backup A
+	movl M,J
+	movl J,MMM # mov M to MMM
+.L0t:
+	xorl J,J
+	addl $1,J
+	sall $23,J # J=2^23
+	subl J,MMM # MMM=MMM-J
+	movl J,M
+	jge .L00t
+	ALIGN_4
+
+	movl MMM,%eax
+	addl J,%eax
+	jle .L999x
+	movl %eax,M
+
+.L00t:
+	movl AA,%eax
+	movl %eax,A # mov AA to A
+
+	movl NN,%eax
+	movl %eax,N # reset N
+
+
+	movl LDAX, LDA # reset LDA
+
 	movl STACK_X, X
 	movl STACK_INCX, INCX
 	movl STACK_INCY, INCY
@@ -628,10 +666,19 @@
 	ALIGN_4
 
 .L999:
+	movl M,J
+	leal (,J,SIZE),%eax
+	addl %eax,AA
+	jmp .L0t
+	ALIGN_4
+
+.L999x:
 	popl %ebx
 	popl %esi
 	popl %edi
 	popl %ebp
+
+	addl $ARGS,%esp
 	ret
 
 	EPILOGUE
diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S
index c7e685dd8..d46d7e43e 100644
--- a/kernel/x86/gemv_t_sse2.S
+++ b/kernel/x86/gemv_t_sse2.S
@@ -76,18 +76,24 @@
 #endif
 
 #define STACKSIZE 16
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define AA 4+STACKSIZE(%esp)
+#define LDAX 8+STACKSIZE(%esp)
+#define NN 12+STACKSIZE(%esp)
 
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
-
 #define I %eax
 #define J %ebx
 
@@ -101,6 +107,8 @@
 
 	PROLOGUE
 
+	subl $ARGS,%esp
+
 	pushl %ebp
 	pushl %edi
 	pushl %esi
@@ -108,7 +116,38 @@
 
 	PROFCODE
 
+
 	movl STACK_LDA, LDA
+	movl LDA,LDAX # backup LDA
+	movl N,J
+	movl J,NN # backup N
+	movl A,J
+	movl J,AA # backup A
+	movl M,J
+	movl J,MMM # mov M to MMM
+.L0t:
+	xorl J,J
+	addl $1,J
+	sall $22,J # J=2^22
+	subl J,MMM # MMM=MMM-J
+	movl J,M
+	jge .L00t
+	ALIGN_4
+
+	movl MMM,%eax
+	addl J,%eax
+	jle .L999x
+	movl %eax,M
+
+.L00t:
+	movl AA,%eax
+	movl %eax,A # mov AA to A
+
+	movl NN,%eax
+	movl %eax,N # reset N
+
+
+	movl LDAX, LDA # reset LDA
 	movl STACK_X, X
 	movl STACK_INCX, INCX
 	movl STACK_INCY, INCY
@@ -117,6 +156,7 @@
 	leal (,INCY, SIZE), INCY
 	leal (,LDA, SIZE), LDA
 
+
 	subl $-16 * SIZE, A
 
 	cmpl $0, N
@@ -560,10 +600,19 @@
 	ALIGN_4
 
 .L999:
+	movl M,J
+	leal (,J,SIZE),%eax
+	addl %eax,AA
+	jmp .L0t
+	ALIGN_4
+
+.L999x:
 	popl %ebx
 	popl %esi
 	popl %edi
 	popl %ebp
+
+	addl $ARGS,%esp
 	ret
 
 	EPILOGUE
diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S
index 052ff1a79..06970a055 100644
--- a/kernel/x86_64/sgemv_t.S
+++ b/kernel/x86_64/sgemv_t.S
@@ -47,7 +47,7 @@
 
 #ifndef WINDOWS_ABI
 
-#define STACKSIZE 64
+#define STACKSIZE 128
 
 #define OLD_M %rdi
 #define OLD_N %rsi
@@ -57,6 +57,10 @@
 #define STACK_Y 16 + STACKSIZE(%rsp)
 #define STACK_INCY 24 + STACKSIZE(%rsp)
 #define STACK_BUFFER 32 + STACKSIZE(%rsp)
+#define MMM 56(%rsp)
+#define NN 64(%rsp)
+#define AA 72(%rsp)
+#define LDAX 80(%rsp)
 
 #else
 
@@ -71,6 +75,10 @@
 #define STACK_Y 72 + STACKSIZE(%rsp)
 #define STACK_INCY 80 + STACKSIZE(%rsp)
 #define STACK_BUFFER 88 + STACKSIZE(%rsp)
+#define MMM 216(%rsp)
+#define NN 224(%rsp)
+#define AA 232(%rsp)
+#define LDAX 240(%rsp)
 
 #endif
 
@@ -127,29 +135,46 @@
 	movups %xmm14, 192(%rsp)
 	movups %xmm15, 208(%rsp)
 
-	movq OLD_M, M
-	movq OLD_N, N
-	movq OLD_A, A
-	movq OLD_LDA, LDA
+	movq OLD_M, MMM
+	movq OLD_N, NN
+	movq OLD_A, AA
+	movq OLD_LDA, LDAX
 	movq OLD_X, X
 #else
-	movq OLD_M, M
-	movq OLD_N, N
-	movq OLD_A, A
-	movq OLD_LDA, LDA
+	movq OLD_M, MMM
+	movq OLD_N, NN
+	movq OLD_A, AA
+	movq OLD_LDA, LDAX
 #endif
-
-	movq STACK_INCX, INCX
-	movq STACK_Y, Y
-	movq STACK_INCY, INCY
-	movq STACK_BUFFER, BUFFER
-
 #ifndef WINDOWS_ABI
 	pshufd $0, %xmm0, ALPHA
 #else
 	pshufd $0, %xmm3, ALPHA
 #endif
 
+
+.L0t:
+	xorq M,M
+	addq $1,M
+	salq $22,M
+	subq M,MMM
+	jge .L00t
+	ALIGN_4
+
+	movq MMM,%rax
+	addq M,%rax
+	jle .L999x
+	movq %rax,M
+
+.L00t:
+	movq LDAX,LDA
+	movq NN,N
+	movq AA,A
+	movq STACK_INCX, INCX
+	movq STACK_Y, Y
+	movq STACK_INCY, INCY
+	movq STACK_BUFFER, BUFFER
+
 	leaq (,INCX, SIZE), INCX
 	leaq (,INCY, SIZE), INCY
 	leaq (,LDA, SIZE), LDA
@@ -6341,6 +6366,12 @@
 	ALIGN_4
 
 .L999:
+	leaq (,M,SIZE),%rax
+	addq %rax,AA
+	jmp .L0t
+	ALIGN_4
+
+.L999x:
 	movq 0(%rsp), %rbx
 	movq 8(%rsp), %rbp
 	movq 16(%rsp), %r12