bugfix for zgemv_n_microk_haswell-2.c

This commit is contained in:
wernsaar 2014-08-13 12:54:18 +02:00
parent 9528f0d9ee
commit 11e34ddd1b
3 changed files with 20 additions and 25 deletions

View File

@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c
DGEMVNKERNEL = dgemv_n.c
DGEMVTKERNEL = dgemv_t.c
ifndef OS_WINDOWS
ZGEMVNKERNEL = zgemv_n.c
endif
ZGEMVTKERNEL = zgemv_t.c
#ZGEMVTKERNEL = zgemv_t.c
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c

View File

@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#if defined(HASWELL)
@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG n2;
FLOAT xbuffer[8],*ybuffer;
#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
inc_x *= 2;

View File

@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
"prefetcht0 192(%5,%0,8) \n\t"
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
"vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 192(%6,%0,8) \n\t"
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 192(%3,%0,8) \n\t"
"vmovups (%3,%0,8), %%ymm10 \n\t"
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
#endif
"prefetcht0 192(%3,%0,8) \n\t"
"vmovups (%3,%0,8), %%ymm12 \n\t"
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
#if !defined(XCONJ)
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
#else
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
#endif
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
"vmovups %%ymm13, 32(%3,%0,8) \n\t"