x86_64: clobber all xmm registers after vzeroupper

As observed with GCC 10 and -march=native -ftree-vectorize on
Knights Landing, the compiler is now smart enough to work out which
registers are clobbered inside non-inlined static functions.

In particular, sgemv counted on a kernel to preserve the whole
%ymm2 register (since it was not in the clobber list), but its upper
128 bits were zeroed by vzeroupper. This caused many tests to fail.

This patch makes sure all xmm registers (and, by extension, the
ymm/zmm registers that alias them) are listed as clobbered so that
this cannot happen; most kernels already did this correctly.
This commit is contained in:
Bart Oldeman 2020-10-20 02:16:47 +00:00
parent 8e20ab21c8
commit b073d759d0
22 changed files with 63 additions and 44 deletions

View File

@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -67,7 +67,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"

View File

@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[1]), // 5 "r" (ap[1]), // 5
"r" (alpha) // 6 "r" (alpha) // 6
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm6", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm8", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"%xmm12", "%xmm13",
"memory" "memory"
); );
} }

View File

@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm2", "%xmm3", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7 "r" (ap[3]), // 7
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6 "r" (ap[2]), // 6
"r" (ap[3]) // 7 "r" (ap[3]) // 7
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"

View File

@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"

View File

@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"r" (ap[3]), // 8 "r" (ap[3]), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7 "r" (ap[3]), // 7
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6 "r" (ap[2]), // 6
"r" (ap[3]) // 7 "r" (ap[3]) // 7
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
return; return;
@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );

View File

@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );