don't use this sgemv_n on Windows

This commit is contained in:
wernsaar 2014-07-19 07:15:34 +02:00
parent c8a4a56177
commit b3938fe371
4 changed files with 53 additions and 41 deletions

View File

@@ -1,4 +1,9 @@
ifdef OS_WINDOWS
SGEMVNKERNEL = ../arm/gemv_n.c
else
SGEMVNKERNEL = sgemv_n_avx.c SGEMVNKERNEL = sgemv_n_avx.c
endif
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S

View File

@@ -1,4 +1,9 @@
ifdef OS_WINDOWS
SGEMVNKERNEL = ../arm/gemv_n.c
else
SGEMVNKERNEL = sgemv_n_avx.c SGEMVNKERNEL = sgemv_n_avx.c
endif
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S

View File

@@ -61,8 +61,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
FLOAT *y_ptr; FLOAT *y_ptr;
BLASLONG n1,n2; BLASLONG n1;
BLASLONG m1,m2; BLASLONG m1;
BLASLONG register m2;
BLASLONG register n2;
FLOAT *xbuffer,*ybuffer; FLOAT *xbuffer,*ybuffer;
xbuffer = buffer; xbuffer = buffer;
ybuffer = xbuffer + 2048 + 256; ybuffer = xbuffer + 2048 + 256;

View File

@@ -25,13 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
float *pre = a + lda*3; float *pre = a + lda*3;
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
@@ -103,10 +103,10 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x,
"m" (x), // 4 "m" (x), // 4
"m" (y), // 5 "m" (y), // 5
"m" (pre) // 6 "m" (pre) // 6
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm8", "xmm9", "xmm10", "xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -114,13 +114,13 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x,
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
float *pre = a + lda*3; float *pre = a + lda*3;
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
@@ -190,21 +190,16 @@ static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x,
"m" (x), // 4 "m" (x), // 4
"m" (y), // 5 "m" (y), // 5
"m" (pre) // 6 "m" (pre) // 6
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
"xmm0" , "xmm1",
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
); );
} }
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
float *pre = a + lda*1; float *pre = a + lda*3;
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
@@ -248,20 +243,21 @@ static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x,
"m" (x), // 4 "m" (x), // 4
"m" (y), // 5 "m" (y), // 5
"m" (pre) // 6 "m" (pre) // 6
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
} }
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
@@ -295,20 +291,21 @@ static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, f
"m" (lda), // 3 "m" (lda), // 3
"m" (x), // 4 "m" (x), // 4
"m" (y) // 5 "m" (y) // 5
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
} }
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
@@ -342,19 +339,20 @@ static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, f
"m" (lda), // 3 "m" (lda), // 3
"m" (x), // 4 "m" (x), // 4
"m" (y) // 5 "m" (y) // 5
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
} }
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1 "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
@@ -392,9 +390,10 @@ static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, f
"m" (lda), // 3 "m" (lda), // 3
"m" (x), // 4 "m" (x), // 4
"m" (y) // 5 "m" (y) // 5
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -402,11 +401,11 @@ static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, f
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{ {
__asm __volatile __asm__ __volatile__
( (
"movq %0, %%rax\n\t" // n -> rax "movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1 "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
@@ -440,9 +439,10 @@ static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, f
"m" (lda), // 3 "m" (lda), // 3
"m" (x), // 4 "m" (x), // 4
"m" (y) // 5 "m" (y) // 5
: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"xmm0" , "xmm1", "%xmm0", "%xmm1",
"xmm12", "xmm13", "xmm14", "xmm15", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );