bugfix for zgemv_n_microk_haswell-2.c

This commit is contained in:
wernsaar 2014-08-13 12:54:18 +02:00
parent 9528f0d9ee
commit 11e34ddd1b
3 changed files with 20 additions and 25 deletions

View File

@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c
DGEMVNKERNEL = dgemv_n.c DGEMVNKERNEL = dgemv_n.c
DGEMVTKERNEL = dgemv_t.c DGEMVTKERNEL = dgemv_t.c
ifndef OS_WINDOWS
ZGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c
endif #ZGEMVTKERNEL = zgemv_t.c
ZGEMVTKERNEL = zgemv_t.c
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c

View File

@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h" #include "common.h"
#if defined(HASWELL) #if defined(HASWELL)
@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG n2; BLASLONG n2;
FLOAT xbuffer[8],*ybuffer; FLOAT xbuffer[8],*ybuffer;
#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer; ybuffer = buffer;
inc_x *= 2; inc_x *= 2;

View File

@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
"prefetcht0 192(%5,%0,8) \n\t" "prefetcht0 192(%5,%0,8) \n\t"
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1 "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
"vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 192(%6,%0,8) \n\t" "prefetcht0 192(%6,%0,8) \n\t"
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2 "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 192(%3,%0,8) \n\t"
"vmovups (%3,%0,8), %%ymm10 \n\t"
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
#endif #endif
"prefetcht0 192(%3,%0,8) \n\t" "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
"vmovups (%3,%0,8), %%ymm12 \n\t" "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
#if !defined(XCONJ)
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
#else
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
#endif
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
"vmovups %%ymm13, 32(%3,%0,8) \n\t" "vmovups %%ymm13, 32(%3,%0,8) \n\t"