bugfix for zgemv_n_microk_haswell-2.c
This commit is contained in:
parent
9528f0d9ee
commit
11e34ddd1b
|
@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c
|
||||||
DGEMVNKERNEL = dgemv_n.c
|
DGEMVNKERNEL = dgemv_n.c
|
||||||
DGEMVTKERNEL = dgemv_t.c
|
DGEMVTKERNEL = dgemv_t.c
|
||||||
|
|
||||||
ifndef OS_WINDOWS
|
|
||||||
ZGEMVNKERNEL = zgemv_n.c
|
ZGEMVNKERNEL = zgemv_n.c
|
||||||
endif
|
#ZGEMVTKERNEL = zgemv_t.c
|
||||||
ZGEMVTKERNEL = zgemv_t.c
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
|
|
@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL)
|
#if defined(HASWELL)
|
||||||
|
@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
BLASLONG n2;
|
BLASLONG n2;
|
||||||
FLOAT xbuffer[8],*ybuffer;
|
FLOAT xbuffer[8],*ybuffer;
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ( m < 1 ) return(0);
|
||||||
|
if ( n < 1 ) return(0);
|
||||||
|
|
||||||
ybuffer = buffer;
|
ybuffer = buffer;
|
||||||
|
|
||||||
inc_x *= 2;
|
inc_x *= 2;
|
||||||
|
|
|
@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
|
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
|
||||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
|
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
|
||||||
|
|
||||||
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
|
|
||||||
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
|
|
||||||
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
|
|
||||||
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
|
|
||||||
|
|
||||||
"prefetcht0 192(%5,%0,8) \n\t"
|
"prefetcht0 192(%5,%0,8) \n\t"
|
||||||
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
|
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
|
||||||
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
|
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
|
||||||
|
|
||||||
"vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||||
"vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||||
"vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||||
"vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||||
|
|
||||||
"prefetcht0 192(%6,%0,8) \n\t"
|
"prefetcht0 192(%6,%0,8) \n\t"
|
||||||
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
|
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
|
||||||
|
@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||||
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||||
|
|
||||||
|
"prefetcht0 192(%3,%0,8) \n\t"
|
||||||
|
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||||
|
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||||
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||||
|
@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
"prefetcht0 192(%3,%0,8) \n\t"
|
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||||
"vmovups (%3,%0,8), %%ymm12 \n\t"
|
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||||
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
|
|
||||||
|
|
||||||
#if !defined(XCONJ)
|
|
||||||
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
|
|
||||||
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
|
|
||||||
#else
|
|
||||||
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
|
|
||||||
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||||
|
|
Loading…
Reference in New Issue