bugfix for zgemv_n_microk_haswell-2.c
This commit is contained in:
parent
9528f0d9ee
commit
11e34ddd1b
|
@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c
|
|||
DGEMVNKERNEL = dgemv_n.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
|
||||
ifndef OS_WINDOWS
|
||||
ZGEMVNKERNEL = zgemv_n.c
|
||||
endif
|
||||
ZGEMVTKERNEL = zgemv_t.c
|
||||
#ZGEMVTKERNEL = zgemv_t.c
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
|
|
|
@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(HASWELL)
|
||||
|
@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
|||
BLASLONG n2;
|
||||
FLOAT xbuffer[8],*ybuffer;
|
||||
|
||||
|
||||
#if 0
|
||||
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
|
||||
#endif
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
ybuffer = buffer;
|
||||
|
||||
inc_x *= 2;
|
||||
|
|
|
@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
|
||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
|
||||
|
||||
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
|
||||
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
|
||||
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
|
||||
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||
|
||||
"prefetcht0 192(%5,%0,8) \n\t"
|
||||
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
|
||||
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
|
||||
|
||||
"vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 192(%6,%0,8) \n\t"
|
||||
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
|
||||
|
@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 192(%3,%0,8) \n\t"
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
|
@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"prefetcht0 192(%3,%0,8) \n\t"
|
||||
"vmovups (%3,%0,8), %%ymm12 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
|
||||
#else
|
||||
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
|
Loading…
Reference in New Issue