Merge pull request #434 from wernsaar/develop
A lot of performance enhancements
This commit is contained in:
		
						commit
						c3cd6e7e32
					
				| 
						 | 
				
			
			@ -377,7 +377,7 @@ SGEMVNKERNEL = sgemv_n.c
 | 
			
		|||
endif
 | 
			
		||||
 | 
			
		||||
ifndef SGEMVTKERNEL
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifndef DGEMVNKERNEL
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,14 +1,8 @@
 | 
			
		|||
ifdef OS_WINDOWS
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
else
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = sgemv_t_avx.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
 | 
			
		||||
ZGEMVNKERNEL = zgemv_n_dup.S
 | 
			
		||||
ZGEMVTKERNEL = zgemv_t.S
 | 
			
		||||
ZGEMVTKERNEL = zgemv_t.c
 | 
			
		||||
 | 
			
		||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
 | 
			
		||||
DGEMVTKERNEL = dgemv_t_bulldozer.S
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,11 +1,14 @@
 | 
			
		|||
ifdef OS_WINDOWS
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
else
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = sgemv_t_avx.c
 | 
			
		||||
endif
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
 | 
			
		||||
DGEMVNKERNEL = dgemv_n.c
 | 
			
		||||
DGEMVTKERNEL = dgemv_t.c
 | 
			
		||||
 | 
			
		||||
ZGEMVNKERNEL = zgemv_n.c
 | 
			
		||||
ZGEMVTKERNEL = zgemv_t.c
 | 
			
		||||
 | 
			
		||||
CGEMVNKERNEL = cgemv_n.c
 | 
			
		||||
CGEMVTKERNEL = cgemv_t.c
 | 
			
		||||
 | 
			
		||||
SGEMMKERNEL    =  sgemm_kernel_16x4_haswell.S
 | 
			
		||||
SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,11 +1,5 @@
 | 
			
		|||
ifdef OS_WINDOWS
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
else
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
 | 
			
		||||
SGEMMKERNEL    =  gemm_kernel_4x8_nehalem.S
 | 
			
		||||
SGEMMINCOPY    =  gemm_ncopy_4.S
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,11 +1,5 @@
 | 
			
		|||
ifdef OS_WINDOWS
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
else
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = sgemv_t_avx.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
 | 
			
		||||
ZGEMVNKERNEL = zgemv_n_dup.S
 | 
			
		||||
ZGEMVTKERNEL = zgemv_t.S
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,10 +1,7 @@
 | 
			
		|||
ifdef OS_WINDOWS
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = ../arm/gemv_t.c
 | 
			
		||||
else
 | 
			
		||||
SGEMVNKERNEL = sgemv_n.c
 | 
			
		||||
SGEMVTKERNEL = sgemv_t_avx.c
 | 
			
		||||
endif
 | 
			
		||||
SGEMVTKERNEL = sgemv_t.c
 | 
			
		||||
 | 
			
		||||
ZGEMVNKERNEL = zgemv_n.c
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
SGEMMKERNEL    =  sgemm_kernel_16x4_sandy.S
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,255 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
#if defined(HASWELL)
 | 
			
		||||
#include "cgemv_n_microk_haswell-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define NBMAX 2048
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
/*
 * Portable fallback for the four-column complex GEMV (non-transposed) kernel.
 *
 * For every one of the n complex rows, the products of four matrix columns
 * with four complex x values are accumulated into y.
 *
 *   n  - number of complex elements handled in each column
 *   ap - ap[0]..ap[3] point to the four columns, stored as (re, im) pairs
 *   x  - four complex multipliers, interleaved as x[0]..x[7]
 *   y  - n interleaved complex accumulators, updated in place
 *
 * When CONJ and XCONJ are both defined or both undefined the plain product
 * a * x is accumulated; otherwise conj(a) * x is accumulated.
 */
static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG pos;
	BLASLONG col;
	FLOAT *column[4];

	/* Fetch the column pointers once, before any element of y is touched. */
	for ( col = 0; col < 4; col++ )
		column[col] = ap[col];

	for ( pos = 0; pos < 2*n; pos += 2 )
	{
		/* Columns are applied in order 0..3, real part before imaginary. */
		for ( col = 0; col < 4; col++ )
		{
			FLOAT *a_col = column[col];
			FLOAT *x_col = x + 2*col;
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			y[pos]   += a_col[pos]*x_col[0] - a_col[pos+1] * x_col[1];
			y[pos+1] += a_col[pos]*x_col[1] + a_col[pos+1] * x_col[0];
#else
			y[pos]   += a_col[pos]*x_col[0] + a_col[pos+1] * x_col[1];
			y[pos+1] += a_col[pos]*x_col[1] - a_col[pos+1] * x_col[0];
#endif
		}
	}
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column complex GEMV (non-transposed) kernel.
 *
 * Accumulates column ap, multiplied by the one complex value stored in
 * x[0] (real) and x[1] (imaginary), into the n complex entries of y.
 * When CONJ and XCONJ are both defined or both undefined the plain product
 * a * x is used; otherwise conj(a) * x is used.
 */
static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG re;

	for ( re = 0; re < 2*n; re += 2 )
	{
		const BLASLONG im = re + 1;	/* index of the imaginary part */

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
		y[re] += ap[re]*x[0] - ap[im] * x[1];
		y[im] += ap[re]*x[1] + ap[im] * x[0];
#else
		y[re] += ap[re]*x[0] + ap[im] * x[1];
		y[im] += ap[re]*x[1] - ap[im] * x[0];
#endif
	}
}
 | 
			
		||||
	
 | 
			
		||||
 | 
			
		||||
/*
 * Clears n complex values (2*n FLOATs) starting at dest.
 */
static void zero_y(BLASLONG n, FLOAT *dest)
{
	const BLASLONG count = 2*n;	/* real and imaginary part per element */
	BLASLONG k;

	for ( k = 0; k < count; k++ )
		dest[k] = 0.0;
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Scales n packed complex values from src by alpha and adds them to dest.
 *
 *   src      - n complex values stored contiguously as (re, im) pairs
 *   dest     - destination vector; consecutive complex elements are
 *              inc_dest FLOATs apart
 *   alpha_r, alpha_i - real and imaginary part of the scaling factor
 *
 * Without XCONJ the added value is alpha * src; with XCONJ it is
 * alpha * conj(src).
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
	BLASLONG k;

	for ( k = 0; k < n; k++, src += 2, dest += inc_dest )
	{
		FLOAT scaled_r;
		FLOAT scaled_i;

		/* Both parts are computed before dest is written. */
#if !defined(XCONJ) 
		scaled_r = alpha_r * src[0] - alpha_i * src[1];
		scaled_i = alpha_r * src[1] + alpha_i * src[0];
#else
		scaled_r =  alpha_r * src[0] + alpha_i * src[1];
		scaled_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

		dest[0] += scaled_r;
		dest[1] += scaled_i;
	}
}
 | 
			
		||||
 | 
			
		||||
/*
 * Complex GEMV driver, non-transposed case: y += alpha * A * x, where the
 * CONJ / XCONJ build macros select the conjugated variants.
 *
 *   m, n     - rows and columns of A; nothing is done if either is < 1
 *   dummy1   - not used
 *   alpha_r, alpha_i - real and imaginary part of alpha
 *   a, lda   - matrix stored column by column; lda counts complex elements
 *   x, inc_x - input vector and its stride in complex elements
 *   y, inc_y - output vector and its stride in complex elements
 *   buffer   - scratch space; up to 2*NBMAX FLOATs are used here
 *
 * The rows that form whole groups of 16 are handled in panels of at most
 * NBMAX rows: each panel is accumulated into the scratch buffer by the
 * column kernels and then scaled by alpha and added to y. The remaining
 * m % 16 rows are handled one at a time by scalar code.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	BLASLONG k;
	FLOAT *col_ptr;
	FLOAT *x_cur;
	FLOAT *y_cur;
	FLOAT *ap[4];
	FLOAT xbuffer[8];
	FLOAT *ybuffer = buffer;
	BLASLONG col_groups;	/* number of complete groups of four columns */
	BLASLONG col_rest;	/* columns left after the groups of four     */
	BLASLONG row_tail;	/* rows handled by the scalar loop           */
	BLASLONG rows_left;	/* rows still to be covered by full panels   */
	BLASLONG last_panel;	/* height of the final, shorter panel        */
	BLASLONG NB;

#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif

	if ( m < 1 || n < 1 ) return(0);

	/* From here on strides are expressed in FLOATs, not complex elements. */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;

	col_groups = n / 4 ;
	col_rest   = n % 4 ;

	row_tail   = m % 16;
	rows_left  = m - row_tail;
	last_panel = (m % NBMAX) - row_tail;

	y_cur = y;

	/* The loop ends after the first panel that is shorter than NBMAX. */
	for ( NB = NBMAX; NB == NBMAX; )
	{
		rows_left -= NB;
		if ( rows_left < 0 )
		{
			if ( last_panel == 0 ) break;
			NB = last_panel;
		}

		col_ptr = a;
		x_cur   = x;
		zero_y(NB,ybuffer);

		for ( i = 0; i < col_groups; i++ )
		{
			/* Gather four x values and the matching column pointers. */
			for ( k = 0; k < 4; k++ )
			{
				xbuffer[2*k]   = x_cur[0];
				xbuffer[2*k+1] = x_cur[1];
				x_cur += inc_x;

				ap[k] = col_ptr;
				col_ptr += lda;
			}
			cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
		}

		for ( i = 0; i < col_rest; i++ )
		{
			xbuffer[0] = x_cur[0];
			xbuffer[1] = x_cur[1];
			x_cur += inc_x;
			cgemv_kernel_16x1(NB,col_ptr,xbuffer,ybuffer);
			col_ptr += lda;
		}

		add_y(NB,ybuffer,y_cur,inc_y,alpha_r,alpha_i);

		/* Step down to the first row of the next panel. */
		a     += 2 * NB;
		y_cur += NB * inc_y;
	}

	/* Scalar handling of the remaining m % 16 rows. */
	for ( j = 0; j < row_tail; j++ )
	{
		FLOAT temp_r = 0.0;
		FLOAT temp_i = 0.0;

		col_ptr = a;
		x_cur   = x;

		for ( i = 0; i < n; i++ )
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r += col_ptr[0] * x_cur[0] - col_ptr[1] * x_cur[1];
			temp_i += col_ptr[0] * x_cur[1] + col_ptr[1] * x_cur[0];
#else
			temp_r += col_ptr[0] * x_cur[0] + col_ptr[1] * x_cur[1];
			temp_i += col_ptr[0] * x_cur[1] - col_ptr[1] * x_cur[0];
#endif
			col_ptr += lda;
			x_cur   += inc_x;
		}

#if !defined(XCONJ) 
		y_cur[0] += alpha_r * temp_r - alpha_i * temp_i;
		y_cur[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
		y_cur[0] += alpha_r * temp_r + alpha_i * temp_i;
		y_cur[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

		y_cur += inc_y;
		a     += 2;	/* next row of A */
	}

	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,137 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
/*
 * AVX2/FMA micro-kernel for the four-column complex GEMV (non-transposed)
 * update: y += a0*x0 + a1*x1 + a2*x2 + a3*x3 on single-precision data.
 *
 *   n  - number of complex elements to process. Each loop iteration
 *        consumes 8 of them and the loop ends when the counter reaches
 *        exactly zero, so n must be a positive multiple of 8.
 *   ap - ap[0]..ap[3] point to the four columns, stored as (re, im) pairs
 *   x  - four complex multipliers, interleaved as x[0]..x[7]
 *   y  - n interleaved complex accumulators, updated in place
 *
 * When CONJ and XCONJ are both defined or both undefined the plain product
 * a * x is accumulated; otherwise conj(a) * x is accumulated.
 *
 * The element offset i and the counter n are both modified by the assembly,
 * so they are declared as read-write ("+r") operands. Declaring them as pure
 * inputs, as was done before, lets the compiler assume the registers still
 * hold their original values afterwards.
 */
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"vzeroupper			 \n\t"

	"vbroadcastss	  (%2), %%ymm0                  \n\t"  // real part x0
	"vbroadcastss	 4(%2), %%ymm1                  \n\t"  // imag part x0
	"vbroadcastss	 8(%2), %%ymm2                  \n\t"  // real part x1
	"vbroadcastss	12(%2), %%ymm3                  \n\t"  // imag part x1
	"vbroadcastss	16(%2), %%ymm4                  \n\t"  // real part x2
	"vbroadcastss	20(%2), %%ymm5                  \n\t"  // imag part x2
	"vbroadcastss	24(%2), %%ymm6                  \n\t"  // real part x3
	"vbroadcastss	28(%2), %%ymm7                  \n\t"  // imag part x3


	".align 16				        \n\t"
	".L01LOOP%=:				        \n\t"
	"prefetcht0      320(%4,%0,4)			\n\t"
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values from a0
	"vmovups      32(%4,%0,4), %%ymm9	        \n\t" // next 4 complex values from a0

	"prefetcht0      320(%5,%0,4)			\n\t"
	"vmovups	(%5,%0,4), %%ymm10              \n\t" // 4 complex values from a1
	"vmovups      32(%5,%0,4), %%ymm11              \n\t" // next 4 complex values from a1

	"vmulps      %%ymm8 , %%ymm0, %%ymm12      \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
	"vmulps      %%ymm8 , %%ymm1, %%ymm13      \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
	"vmulps      %%ymm9 , %%ymm0, %%ymm14      \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
	"vmulps      %%ymm9 , %%ymm1, %%ymm15      \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i

	"prefetcht0      320(%6,%0,4)			\n\t"
	"vmovups	(%6,%0,4), %%ymm8	        \n\t" // 4 complex values from a2
	"vmovups      32(%6,%0,4), %%ymm9	        \n\t" // next 4 complex values from a2

	"vfmadd231ps      %%ymm10, %%ymm2, %%ymm12      \n\t" // ymm12 += a1 (first  half) * real part x1
	"vfmadd231ps      %%ymm10, %%ymm3, %%ymm13      \n\t" // ymm13 += a1 (first  half) * imag part x1
	"vfmadd231ps      %%ymm11, %%ymm2, %%ymm14      \n\t" // ymm14 += a1 (second half) * real part x1
	"vfmadd231ps      %%ymm11, %%ymm3, %%ymm15      \n\t" // ymm15 += a1 (second half) * imag part x1

	"prefetcht0      320(%7,%0,4)			\n\t"
	"vmovups	(%7,%0,4), %%ymm10              \n\t" // 4 complex values from a3
	"vmovups      32(%7,%0,4), %%ymm11              \n\t" // next 4 complex values from a3

	"vfmadd231ps      %%ymm8 , %%ymm4, %%ymm12      \n\t" // ymm12 += a2 (first  half) * real part x2
	"vfmadd231ps      %%ymm8 , %%ymm5, %%ymm13      \n\t" // ymm13 += a2 (first  half) * imag part x2
	"vfmadd231ps      %%ymm9 , %%ymm4, %%ymm14      \n\t" // ymm14 += a2 (second half) * real part x2
	"vfmadd231ps      %%ymm9 , %%ymm5, %%ymm15      \n\t" // ymm15 += a2 (second half) * imag part x2

	"vfmadd231ps      %%ymm10, %%ymm6, %%ymm12      \n\t" // ymm12 += a3 (first  half) * real part x3
	"vfmadd231ps      %%ymm10, %%ymm7, %%ymm13      \n\t" // ymm13 += a3 (first  half) * imag part x3
	"vfmadd231ps      %%ymm11, %%ymm6, %%ymm14      \n\t" // ymm14 += a3 (second half) * real part x3
	"vfmadd231ps      %%ymm11, %%ymm7, %%ymm15      \n\t" // ymm15 += a3 (second half) * imag part x3

	"prefetcht0      320(%3,%0,4)			\n\t"
	"vmovups	  (%3,%0,4),  %%ymm10           \n\t" // current 4 complex values of y
	"vmovups	32(%3,%0,4),  %%ymm11           \n\t" // next 4 complex values of y

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        "vpermilps      $0xb1 , %%ymm13, %%ymm13               \n\t" // swap re/im lanes of the x_i products
        "vpermilps      $0xb1 , %%ymm15, %%ymm15               \n\t"
        "vaddsubps      %%ymm13, %%ymm12, %%ymm8              \n\t" // a_r*x_r - a_i*x_i , a_i*x_r + a_r*x_i
        "vaddsubps      %%ymm15, %%ymm14, %%ymm9              \n\t"
#else
        "vpermilps      $0xb1 , %%ymm12, %%ymm12               \n\t" // swap re/im lanes of the x_r products
        "vpermilps      $0xb1 , %%ymm14, %%ymm14               \n\t"
        "vaddsubps      %%ymm12, %%ymm13, %%ymm8              \n\t" // a_r*x_i - a_i*x_r , a_i*x_i + a_r*x_r
        "vaddsubps      %%ymm14, %%ymm15, %%ymm9              \n\t"
        "vpermilps      $0xb1 , %%ymm8 , %%ymm8                \n\t" // back to (re, im) order
        "vpermilps      $0xb1 , %%ymm9 , %%ymm9                \n\t"
#endif

        "vaddps         %%ymm8, %%ymm10, %%ymm12              \n\t"
        "vaddps         %%ymm9, %%ymm11, %%ymm13              \n\t"

	"vmovups  %%ymm12,   (%3,%0,4)		        \n\t" // 4 complex values to y
	"vmovups  %%ymm13, 32(%3,%0,4)		        \n\t" // next 4 complex values to y

        "addq		$16, %0	  	 	        \n\t" // advance by 16 floats (8 complex values)
	"subq	        $8 , %1			        \n\t" // 8 complex values done
	"jnz		.L01LOOP%=		        \n\t"
	"vzeroupper			 \n\t"

	:
          "+r" (i),	// 0	read-write: running element offset
	  "+r" (n)  	// 1	read-write: remaining complex values
        : 
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
	: "cc", 
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,265 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
#if defined(HASWELL)
 | 
			
		||||
#include "cgemv_t_microk_haswell-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define NBMAX 2048
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
/*
 * Portable fallback for the four-column complex GEMV (transposed) kernel.
 *
 * Computes four complex dot products, one per column in ap[0]..ap[3], each
 * taken against the n complex values in x. The results are stored (not
 * accumulated) in y as four interleaved (re, im) pairs, y[0]..y[7].
 *
 * When CONJ and XCONJ are both defined or both undefined the terms are
 * a * x; otherwise they are conj(a) * x.
 */
static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG pos;
	BLASLONG col;
	FLOAT *column[4];
	FLOAT sum_r[4] = { 0.0, 0.0, 0.0, 0.0 };
	FLOAT sum_i[4] = { 0.0, 0.0, 0.0, 0.0 };

	for ( col = 0; col < 4; col++ )
		column[col] = ap[col];

	for ( pos = 0; pos < 2*n; pos += 2 )
	{
		for ( col = 0; col < 4; col++ )
		{
			FLOAT *a_col = column[col];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			sum_r[col] += a_col[pos]*x[pos]   - a_col[pos+1]*x[pos+1];
			sum_i[col] += a_col[pos]*x[pos+1] + a_col[pos+1]*x[pos];
#else
			sum_r[col] += a_col[pos]*x[pos]   + a_col[pos+1]*x[pos+1];
			sum_i[col] += a_col[pos]*x[pos+1] - a_col[pos+1]*x[pos];
#endif
		}
	}

	/* Results leave the kernel as (re, im) pairs in column order. */
	for ( col = 0; col < 4; col++ )
	{
		y[2*col]   = sum_r[col];
		y[2*col+1] = sum_i[col];
	}
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column complex GEMV (transposed) kernel.
 *
 * Computes the complex dot product of the n values in column ap with the n
 * values in x and stores it in y[0] (real) and y[1] (imaginary).
 * When CONJ and XCONJ are both defined or both undefined the terms are
 * a * x; otherwise they are conj(a) * x.
 */
static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG re;
	FLOAT sum_r = 0.0;
	FLOAT sum_i = 0.0;

	for ( re = 0; re < 2*n; re += 2 )
	{
		const BLASLONG im = re + 1;	/* index of the imaginary part */

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
		sum_r += ap[re]*x[re] - ap[im]*x[im];
		sum_i += ap[re]*x[im] + ap[im]*x[re];
#else
		sum_r += ap[re]*x[re] + ap[im]*x[im];
		sum_i += ap[re]*x[im] - ap[im]*x[re];
#endif
	}

	y[0] = sum_r;
	y[1] = sum_i;
}
 | 
			
		||||
 | 
			
		||||
/*
 * Packs n complex values from a strided vector into contiguous storage.
 *
 *   src     - source vector; consecutive complex elements are inc_src
 *             FLOATs apart
 *   dest    - receives the n values as adjacent (re, im) pairs
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG k;

	for ( k = 0; k < n; k++, src += inc_src )
	{
		dest[2*k]   = src[0];
		dest[2*k+1] = src[1];
	}
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Adds alpha times one complex dot-product result to a single element of y.
 * Without XCONJ the added value is alpha * dot; with XCONJ it is
 * alpha * conj(dot).
 */
static void cgemv_t_scale_add(FLOAT *y_elem, FLOAT dot_r, FLOAT dot_i, FLOAT alpha_r, FLOAT alpha_i)
{
#if !defined(XCONJ)
	y_elem[0] += alpha_r * dot_r - alpha_i * dot_i;
	y_elem[1] += alpha_r * dot_i + alpha_i * dot_r;
#else
	y_elem[0] += alpha_r * dot_r + alpha_i * dot_i;
	y_elem[1] -= alpha_r * dot_i - alpha_i * dot_r;
#endif
}

/*
 * Complex GEMV driver, transposed case: y += alpha * A^T * x, where the
 * CONJ / XCONJ build macros select the conjugated variants.
 *
 *   m, n     - rows and columns of A; nothing is done if either is < 1
 *   dummy1   - not used
 *   alpha_r, alpha_i - real and imaginary part of alpha
 *   a, lda   - matrix stored column by column; lda counts complex elements
 *   x, inc_x - input vector (m elements) and its stride in complex elements
 *   y, inc_y - output vector (n elements) and its stride in complex elements
 *   buffer   - scratch space; up to 2*NBMAX FLOATs are used here
 *
 * The rows that form whole groups of 16 are handled in panels of at most
 * NBMAX rows: the matching part of x is packed into the scratch buffer, the
 * kernels produce one partial dot product per column, and each of them is
 * scaled by alpha and added to y. The remaining m % 16 rows are handled by
 * scalar code in the same way.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[4];
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	FLOAT ybuffer[8],*xbuffer;
	BLASLONG NB = NBMAX;

	/* Same early-out as the non-transposed driver: empty problems are a
	   no-op and must not reach the blocking arithmetic below. */
	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	/* From here on strides are expressed in FLOATs, not complex elements. */
        inc_x *= 2;
        inc_y *= 2;
        lda   *= 2;

	xbuffer = buffer;
	
	n1 = n / 4 ;
	n2 = n % 4 ;
	
	m3 = m % 16;			/* rows left for the scalar tail   */
	m1 = m - m3;			/* rows covered by the panel loop  */
	m2 = (m % NBMAX) - m3;		/* height of the last, short panel */

	/* The loop ends after the first panel that is shorter than NBMAX. */
	while ( NB == NBMAX )
	{
		
		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;	
			NB = m2;
		}
		
		y_ptr = y;
		a_ptr = a;
		x_ptr = x;
		copy_x(NB,x_ptr,xbuffer,inc_x);

		for( i = 0; i < n1 ; i++)
		{
			ap[0] = a_ptr;
			ap[1] = a_ptr + lda;
			ap[2] = ap[1] + lda;
			ap[3] = ap[2] + lda;
			cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
			a_ptr += 4 * lda;

			/* ybuffer holds four (re, im) partial dot products. */
			for( j = 0; j < 4 ; j++)
			{
				cgemv_t_scale_add(y_ptr, ybuffer[2*j], ybuffer[2*j+1], alpha_r, alpha_i);
				y_ptr  += inc_y;
			}
		}

		for( i = 0; i < n2 ; i++)
		{
			cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
			a_ptr += 1 * lda;

			cgemv_t_scale_add(y_ptr, ybuffer[0], ybuffer[1], alpha_r, alpha_i);
			y_ptr  += inc_y;
		}

		/* Step down to the first row of the next panel. */
		a += 2* NB;
		x += NB * inc_x;	
	}

	if ( m3 == 0 ) return(0);

	/* Scalar handling of the remaining m % 16 rows. */
	x_ptr = x;
	copy_x(m3,x_ptr,xbuffer,inc_x);
	a_ptr = a;
	y_ptr = y;
	for( j = 0; j < n ; j++)
	{
		FLOAT temp_r = 0.0;
		FLOAT temp_i = 0.0;
		for( i = 0; i < m3*2; i+=2 )
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r += a_ptr[i] * xbuffer[i]   - a_ptr[i+1] * xbuffer[i+1];
			temp_i += a_ptr[i] * xbuffer[i+1] + a_ptr[i+1] * xbuffer[i];
#else
			temp_r += a_ptr[i] * xbuffer[i]   + a_ptr[i+1] * xbuffer[i+1];
			temp_i += a_ptr[i] * xbuffer[i+1] - a_ptr[i+1] * xbuffer[i];
#endif
		}
		a_ptr += lda;

		cgemv_t_scale_add(y_ptr, temp_r, temp_i, alpha_r, alpha_i);
		y_ptr += inc_y;
	}
	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,171 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
/*
 * Haswell micro-kernel (AVX + FMA inline assembly) for the transposed
 * complex GEMV.  The assembly uses packed single-precision instructions
 * and a 4-byte index scale, i.e. it treats all data as 32-bit floats.
 *
 * For each of the four columns ap[0..3] it accumulates the complex dot
 * product of the column with x and stores the four results as
 * (real, imag) pairs in y[0..7], overwriting the previous content.
 *
 *   without CONJ/XCONJ, or with both:  re = ar*xr - ai*xi, im = ai*xr + ar*xi
 *   with exactly one of them:          re = ar*xr + ai*xi, im = ar*xi - ai*xr
 *
 *   n   number of complex elements per column.  Every loop pass consumes
 *       8 of them and the loop only ends when the counter reaches exactly
 *       zero, so n must be a positive multiple of 8.
 *   ap  four column pointers, interleaved real/imag values
 *   x   contiguous interleaved vector, read with the same index as ap[]
 *   y   output, 4 interleaved complex values
 *
 * The loop index and the counter are both modified by the assembly, so
 * they are passed as read-write ("+r") operands; GCC does not allow an
 * asm statement to change operands that are declared input-only.
 */
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	register BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vzeroupper			 \n\t"

	"vxorps		%%ymm8 , %%ymm8 , %%ymm8 	\n\t" // acc: a0 * real(x)
	"vxorps		%%ymm9 , %%ymm9 , %%ymm9 	\n\t" // acc: a0 * imag(x)
	"vxorps		%%ymm10, %%ymm10, %%ymm10	\n\t" // acc: a1 * real(x)
	"vxorps		%%ymm11, %%ymm11, %%ymm11	\n\t" // acc: a1 * imag(x)
	"vxorps		%%ymm12, %%ymm12, %%ymm12	\n\t" // acc: a2 * real(x)
	"vxorps		%%ymm13, %%ymm13, %%ymm13	\n\t" // acc: a2 * imag(x)
	"vxorps		%%ymm14, %%ymm14, %%ymm14	\n\t" // acc: a3 * real(x)
	"vxorps		%%ymm15, %%ymm15, %%ymm15	\n\t" // acc: a3 * imag(x)

	".align 16				        \n\t"
	".L01LOOP%=:				        \n\t"
        "prefetcht0      192(%4,%0,4)                   \n\t"
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
        "prefetcht0      192(%5,%0,4)                   \n\t"
	"vmovups	(%5,%0,4), %%ymm5               \n\t" // 4 complex values from a1

        "prefetcht0      192(%2,%0,4)                   \n\t"
	"vmovups	    (%2,%0,4)  , %%ymm6		\n\t" // 4 complex values from x
	"vpermilps        $0xb1, %%ymm6, %%ymm7		\n\t" // exchange real and imag parts
	"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0         \n\t" // only the real parts
	"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1         \n\t" // only the imag parts

        "prefetcht0      192(%6,%0,4)                   \n\t"
	"vmovups	(%6,%0,4), %%ymm6	        \n\t" // 4 complex values from a2
        "prefetcht0      192(%7,%0,4)                   \n\t"
	"vmovups	(%7,%0,4), %%ymm7               \n\t" // 4 complex values from a3

	"vfmadd231ps      %%ymm4 , %%ymm0, %%ymm8       \n\t" // a0 * real(x)
	"vfmadd231ps      %%ymm4 , %%ymm1, %%ymm9       \n\t" // a0 * imag(x)
	"vfmadd231ps      %%ymm5 , %%ymm0, %%ymm10      \n\t" // a1 * real(x)
	"vfmadd231ps      %%ymm5 , %%ymm1, %%ymm11      \n\t" // a1 * imag(x)
	"vfmadd231ps      %%ymm6 , %%ymm0, %%ymm12      \n\t" // a2 * real(x)
	"vfmadd231ps      %%ymm6 , %%ymm1, %%ymm13      \n\t" // a2 * imag(x)
	"vfmadd231ps      %%ymm7 , %%ymm0, %%ymm14      \n\t" // a3 * real(x)
	"vfmadd231ps      %%ymm7 , %%ymm1, %%ymm15      \n\t" // a3 * imag(x)

	"vmovups       32(%4,%0,4), %%ymm4	        \n\t" // next 4 complex values from a0
	"vmovups       32(%5,%0,4), %%ymm5              \n\t" // next 4 complex values from a1

	"vmovups	  32(%2,%0,4)  , %%ymm6		\n\t" // next 4 complex values from x
	"vpermilps        $0xb1, %%ymm6, %%ymm7		\n\t" // exchange real and imag parts
	"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0         \n\t" // only the real parts
	"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1         \n\t" // only the imag parts

	"vmovups       32(%6,%0,4), %%ymm6	        \n\t" // next 4 complex values from a2
	"vmovups       32(%7,%0,4), %%ymm7              \n\t" // next 4 complex values from a3

	"vfmadd231ps      %%ymm4 , %%ymm0, %%ymm8       \n\t" // a0 * real(x)
	"vfmadd231ps      %%ymm4 , %%ymm1, %%ymm9       \n\t" // a0 * imag(x)
	"vfmadd231ps      %%ymm5 , %%ymm0, %%ymm10      \n\t" // a1 * real(x)
	"vfmadd231ps      %%ymm5 , %%ymm1, %%ymm11      \n\t" // a1 * imag(x)
	"vfmadd231ps      %%ymm6 , %%ymm0, %%ymm12      \n\t" // a2 * real(x)
	"vfmadd231ps      %%ymm6 , %%ymm1, %%ymm13      \n\t" // a2 * imag(x)
	"vfmadd231ps      %%ymm7 , %%ymm0, %%ymm14      \n\t" // a3 * real(x)
	"vfmadd231ps      %%ymm7 , %%ymm1, %%ymm15      \n\t" // a3 * imag(x)

        "addq		$16 , %0	  	 	        \n\t" // 8 complex values = 16 floats consumed
	"subq	        $8  , %1			        \n\t"
	"jnz		.L01LOOP%=		        \n\t"

	// combine the "* real(x)" and "* imag(x)" accumulators into complex products
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        "vpermilps      $0xb1 , %%ymm9 , %%ymm9                \n\t"
        "vpermilps      $0xb1 , %%ymm11, %%ymm11               \n\t"
        "vpermilps      $0xb1 , %%ymm13, %%ymm13               \n\t"
        "vpermilps      $0xb1 , %%ymm15, %%ymm15               \n\t"
        "vaddsubps      %%ymm9 , %%ymm8, %%ymm8               \n\t"
        "vaddsubps      %%ymm11, %%ymm10, %%ymm10             \n\t"
        "vaddsubps      %%ymm13, %%ymm12, %%ymm12             \n\t"
        "vaddsubps      %%ymm15, %%ymm14, %%ymm14             \n\t"
#else
        "vpermilps      $0xb1 , %%ymm8 , %%ymm8                \n\t"
        "vpermilps      $0xb1 , %%ymm10, %%ymm10               \n\t"
        "vpermilps      $0xb1 , %%ymm12, %%ymm12               \n\t"
        "vpermilps      $0xb1 , %%ymm14, %%ymm14               \n\t"
        "vaddsubps      %%ymm8 , %%ymm9 , %%ymm8              \n\t"
        "vaddsubps      %%ymm10, %%ymm11, %%ymm10             \n\t"
        "vaddsubps      %%ymm12, %%ymm13, %%ymm12             \n\t"
        "vaddsubps      %%ymm14, %%ymm15, %%ymm14             \n\t"
        "vpermilps      $0xb1 , %%ymm8 , %%ymm8                \n\t"
        "vpermilps      $0xb1 , %%ymm10, %%ymm10               \n\t"
        "vpermilps      $0xb1 , %%ymm12, %%ymm12               \n\t"
        "vpermilps      $0xb1 , %%ymm14, %%ymm14               \n\t"
#endif

	// horizontal reduction: fold the upper 128-bit lane into the lower one ...
	"vextractf128   $1, %%ymm8 , %%xmm9		      \n\t"
	"vextractf128   $1, %%ymm10, %%xmm11	      	      \n\t"
	"vextractf128   $1, %%ymm12, %%xmm13		      \n\t"
	"vextractf128   $1, %%ymm14, %%xmm15		      \n\t"

	"vaddps		%%xmm8 , %%xmm9 , %%xmm8       \n\t"
	"vaddps		%%xmm10, %%xmm11, %%xmm10      \n\t"
	"vaddps		%%xmm12, %%xmm13, %%xmm12      \n\t"
	"vaddps		%%xmm14, %%xmm15, %%xmm14      \n\t"

	// ... then fold the two remaining complex values into one
	"vshufpd        $0x1, %%xmm8 , %%xmm8 , %%xmm9   \n\t"
	"vshufpd        $0x1, %%xmm10, %%xmm10, %%xmm11  \n\t"
	"vshufpd        $0x1, %%xmm12, %%xmm12, %%xmm13  \n\t"
	"vshufpd        $0x1, %%xmm14, %%xmm14, %%xmm15  \n\t"

	"vaddps		%%xmm8 , %%xmm9 , %%xmm8       \n\t"
	"vaddps		%%xmm10, %%xmm11, %%xmm10      \n\t"
	"vaddps		%%xmm12, %%xmm13, %%xmm12      \n\t"
	"vaddps		%%xmm14, %%xmm15, %%xmm14      \n\t"

	// each 8-byte store writes one (real, imag) pair
	"vmovsd	%%xmm8 ,   (%3)			\n\t"
	"vmovsd	%%xmm10,  8(%3)			\n\t"
	"vmovsd	%%xmm12, 16(%3)			\n\t"
	"vmovsd	%%xmm14, 24(%3)			\n\t"

	"vzeroupper			 \n\t"

	:
          "+r" (i),	// 0	modified by the loop
	  "+r" (n)  	// 1	modified by the loop
        :
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,206 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if defined(HASWELL)
 | 
			
		||||
#include "dgemv_n_microk_haswell-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define NBMAX 2048
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
/*
 * Portable fallback kernel: adds the contribution of four matrix columns
 * to the accumulator vector,
 *
 *     y[r] += ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3]
 *
 * Rows are handled in groups of four without a remainder step, so n is
 * expected to be a multiple of 4.
 */
static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	FLOAT *c0 = ap[0];
	FLOAT *c1 = ap[1];
	FLOAT *c2 = ap[2];
	FLOAT *c3 = ap[3];
	BLASLONG row;
	BLASLONG k;

	for ( row = 0; row < n; row += 4 )
	{
		/* one group of four consecutive rows */
		for ( k = row; k < row + 4; k++ )
		{
			y[k] += c0[k]*x[0] + c1[k]*x[1] + c2[k]*x[2] + c3[k]*x[3];
		}
	}
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column kernel: y[r] += ap[r] * x[0].
 *
 * Rows are handled in groups of four without a remainder step, so n is
 * expected to be a multiple of 4.
 */
static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG row;
	BLASLONG k;

	for ( row = 0; row < n; row += 4 )
	{
		for ( k = row; k < row + 4; k++ )
		{
			y[k] += ap[k]*x[0];
		}
	}
}
 | 
			
		||||
	
 | 
			
		||||
 | 
			
		||||
/* Set the first n elements of dest to zero. */
static void zero_y(BLASLONG n, FLOAT *dest)
{
	BLASLONG k;

	for ( k = 0; k < n; k++ )
	{
		dest[k] = 0.0;
	}
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Add the contiguous vector src[0..n-1] element-wise onto dest, where
 * consecutive elements of dest are inc_dest apart.
 *
 * Exactly n elements are read and written for every n; the unit-stride
 * path no longer assumes that n is a multiple of 4, so it cannot touch
 * memory behind the end of either vector.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG i;

	if ( inc_dest == 1 )
	{
		/* contiguous destination: plain indexed loop */
		for ( i=0; i<n; i++ )
		{
			dest[i] += src[i];
		}
		return;
	}

	/* strided destination */
	for ( i=0; i<n; i++ )
	{
		*dest += src[i];
		dest += inc_dest;
	}
}
 | 
			
		||||
 | 
			
		||||
/*
 * GEMV, non-transposed: for every row r < m
 *
 *     y[r*inc_y] += alpha * sum over c < n of a[r + c*lda] * x[c*inc_x]
 *
 * The rows that fit into whole groups of 16 are processed in panels of at
 * most NBMAX rows.  Each panel is accumulated in `buffer` (up to NBMAX
 * elements are written there) by the 16x4 / 16x1 kernels and then added
 * to y.  The remaining m % 16 rows are handled by a scalar loop.
 *
 * dummy1 is not used.  Always returns 0; nothing is done if m < 1 or
 * n < 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	if ( m < 1 || n < 1 ) return(0);

	FLOAT *acc = buffer;		/* accumulator for the current panel */
	FLOAT *out = y;			/* next element of y to be updated   */
	FLOAT  xs[4];			/* alpha-scaled elements of x        */
	FLOAT *cols[4];			/* column pointers for the 16x4 kernel */
	FLOAT *col;
	FLOAT *xp;

	const BLASLONG quads      = n / 4;
	const BLASLONG leftover   = n % 4;
	const BLASLONG tail_rows  = m % 16;
	const BLASLONG last_panel = (m % NBMAX) - tail_rows;

	BLASLONG rows_left = m - tail_rows;
	BLASLONG panel     = NBMAX;
	BLASLONG k;
	BLASLONG c;

	do
	{
		rows_left -= panel;
		if ( rows_left < 0 )
		{
			/* fewer than NBMAX rows remain: last, shorter panel (if any) */
			if ( last_panel == 0 ) break;
			panel = last_panel;
		}

		col = a;
		xp  = x;
		zero_y(panel,acc);

		/* four columns at a time */
		for ( k = 0; k < quads; k++ )
		{
			for ( c = 0; c < 4; c++ )
			{
				xs[c]   = alpha * xp[0];
				xp     += inc_x;
				cols[c] = col + c * lda;
			}
			dgemv_kernel_16x4(panel,cols,xs,acc);
			col += 4 * lda;
		}

		/* the up to three columns that are left */
		for ( k = 0; k < leftover; k++ )
		{
			xs[0] = alpha * xp[0];
			xp   += inc_x;
			dgemv_kernel_16x1(panel,col,xs,acc);
			col  += lda;
		}

		add_y(panel,acc,out,inc_y);
		a   += panel;
		out += panel * inc_y;

	} while ( panel == NBMAX );

	/* scalar loop for the m % 16 rows the panels did not cover */
	for ( k = 0; k < tail_rows; k++ )
	{
		FLOAT dot = 0.0;

		col = a;
		xp  = x;
		for ( c = 0; c < n; c++ )
		{
			dot += col[0] * xp[0];
			col += lda;
			xp  += inc_x;
		}
		out[0] += alpha * dot;
		out += inc_y;
		a++;
	}

	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,89 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
/*
 * Haswell micro-kernel (AVX + FMA inline assembly) for the non-transposed
 * GEMV.  The assembly uses packed double-precision instructions and an
 * 8-byte index scale, i.e. it treats all data as 64-bit doubles.
 *
 * Adds the contribution of four matrix columns to the accumulator:
 *
 *     y[r] += ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3]
 *
 *   n   number of rows.  Every loop pass handles 8 rows and the loop only
 *       ends when the counter reaches exactly zero, so n must be a
 *       positive multiple of 8.
 *   ap  four column pointers
 *   x   four multipliers, one per column
 *   y   accumulator, updated in place
 *
 * The loop index and the counter are both modified by the assembly, so
 * they are passed as read-write ("+r") operands; GCC does not allow an
 * asm statement to change operands that are declared input-only.
 */
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	register BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vzeroupper			 \n\t"
	"vbroadcastsd    (%2), %%ymm12	 \n\t"	// x0 
	"vbroadcastsd   8(%2), %%ymm13	 \n\t"	// x1 
	"vbroadcastsd  16(%2), %%ymm14	 \n\t"	// x2 
	"vbroadcastsd  24(%2), %%ymm15	 \n\t"	// x3 

	".align 16				 \n\t"
	".L01LOOP%=:				 \n\t"
	"prefetcht0	 192(%3,%0,8)		 \n\t"
	"vmovups	(%3,%0,8), %%ymm4	 \n\t"	// 4 * y
	"vmovups      32(%3,%0,8), %%ymm5	 \n\t"	// 4 * y

	"prefetcht0	 192(%4,%0,8)		       \n\t"
	"vfmadd231pd   (%4,%0,8), %%ymm12, %%ymm4      \n\t"	// y += a0 * x0
	"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5      \n\t" 
	"prefetcht0	 192(%5,%0,8)		       \n\t"
	"vfmadd231pd   (%5,%0,8), %%ymm13, %%ymm4      \n\t"	// y += a1 * x1
	"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5      \n\t" 
	"prefetcht0	 192(%6,%0,8)		       \n\t"
	"vfmadd231pd   (%6,%0,8), %%ymm14, %%ymm4      \n\t"	// y += a2 * x2
	"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5      \n\t" 
	"prefetcht0	 192(%7,%0,8)		       \n\t"
	"vfmadd231pd   (%7,%0,8), %%ymm15, %%ymm4      \n\t"	// y += a3 * x3
	"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5      \n\t" 

	"vmovups  %%ymm4,   (%3,%0,8)		      \n\t"	// 4 * y
	"vmovups  %%ymm5, 32(%3,%0,8)		      \n\t"	// 4 * y

        "addq		$8 , %0	  	 	      \n\t"	// 8 rows done
	"subq	        $8 , %1			      \n\t"
	"jnz		.L01LOOP%=		      \n\t"
	"vzeroupper			 \n\t"

	:
          "+r" (i),	// 0	modified by the loop
	  "+r" (n)  	// 1	modified by the loop
        :
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,191 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
#if defined(HASWELL)
 | 
			
		||||
#include "dgemv_t_microk_haswell-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define NBMAX 2048
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
/*
 * Portable fallback kernel for the transposed GEMV: computes the dot
 * product of x with each of the four columns ap[0..3] over n elements and
 * stores (not adds) the results in y[0..3].
 *
 * Elements are consumed in groups of four without a remainder step, so n
 * is expected to be a multiple of 4.
 */
static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	FLOAT sum[4] = { 0.0, 0.0, 0.0, 0.0 };
	BLASLONG row;
	BLASLONG c;

	for ( row = 0; row < n; row += 4 )
	{
		for ( c = 0; c < 4; c++ )
		{
			const FLOAT *col = ap[c];

			sum[c] += col[row]*x[row] + col[row+1]*x[row+1] + col[row+2]*x[row+2] + col[row+3]*x[row+3];
		}
	}

	for ( c = 0; c < 4; c++ )
	{
		y[c] = sum[c];
	}
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column kernel for the transposed GEMV: stores the dot product of
 * ap and x over n elements in y[0].
 *
 * Elements are consumed in groups of four without a remainder step, so n
 * is expected to be a multiple of 4.
 */
static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	FLOAT sum = 0.0;
	BLASLONG row;

	for ( row = 0; row < n; row += 4 )
	{
		sum += ap[row]*x[row] + ap[row+1]*x[row+1] + ap[row+2]*x[row+2] + ap[row+3]*x[row+3];
	}

	y[0] = sum;
}
 | 
			
		||||
	
 | 
			
		||||
/*
 * Gather n elements from src, taking every inc_src-th element, into the
 * contiguous array dest.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG k;

	for ( k = 0; k < n; k++ )
	{
		dest[k] = *src;
		src += inc_src;
	}
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * GEMV, transposed: for every column c < n
 *
 *     y[c*inc_y] += alpha * sum over r < m of a[r + c*lda] * x[r*inc_x]
 *
 * The rows that fit into whole groups of 16 are processed in panels of at
 * most NBMAX rows.  For each panel the matching part of x is gathered
 * into `buffer` (up to NBMAX elements are written there) and the 16x4 /
 * 16x1 kernels deliver the per-column dot products.  The remaining
 * m % 16 rows are handled by a scalar loop at the end.
 *
 * dummy1 is not used.  Always returns 0; nothing is done if m < 1 or
 * n < 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	if ( m < 1 || n < 1 ) return(0);

	FLOAT  dots[4];			/* kernel results                      */
	FLOAT *xs = buffer;		/* contiguous copy of the current x part */
	FLOAT *cols[4];			/* column pointers for the 16x4 kernel */
	FLOAT *col;
	FLOAT *out;

	const BLASLONG quads      = n / 4;
	const BLASLONG leftover   = n % 4;
	const BLASLONG tail_rows  = m % 16;
	const BLASLONG last_panel = (m % NBMAX) - tail_rows;

	BLASLONG rows_left = m - tail_rows;
	BLASLONG panel     = NBMAX;
	BLASLONG k;
	BLASLONG c;

	do
	{
		rows_left -= panel;
		if ( rows_left < 0 )
		{
			/* fewer than NBMAX rows remain: last, shorter panel (if any) */
			if ( last_panel == 0 ) break;
			panel = last_panel;
		}

		out = y;
		col = a;
		copy_x(panel,x,xs,inc_x);

		/* four columns at a time */
		for ( k = 0; k < quads; k++ )
		{
			for ( c = 0; c < 4; c++ )
			{
				cols[c] = col + c * lda;
			}
			dgemv_kernel_16x4(panel,cols,xs,dots);
			col += 4 * lda;

			for ( c = 0; c < 4; c++ )
			{
				*out += dots[c]*alpha;
				out  += inc_y;
			}
		}

		/* the up to three columns that are left */
		for ( k = 0; k < leftover; k++ )
		{
			dgemv_kernel_16x1(panel,col,xs,dots);
			col  += lda;
			*out += dots[0]*alpha;
			out  += inc_y;
		}

		a += panel;
		x += panel * inc_x;

	} while ( panel == NBMAX );

	if ( tail_rows == 0 ) return(0);

	/* scalar loop for the m % 16 rows the panels did not cover */
	copy_x(tail_rows,x,xs,inc_x);

	col = a;
	out = y;
	for ( k = 0; k < n; k++ )
	{
		FLOAT dot = 0.0;

		for ( c = 0; c < tail_rows; c++ )
		{
			dot += col[c] * xs[c];
		}
		col += lda;
		out[0] += alpha * dot;
		out += inc_y;
	}

	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,107 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			         \n\t"
 | 
			
		||||
	"vxorpd		%%ymm4 , %%ymm4, %%ymm4  \n\t"
 | 
			
		||||
	"vxorpd		%%ymm5 , %%ymm5, %%ymm5  \n\t"
 | 
			
		||||
	"vxorpd		%%ymm6 , %%ymm6, %%ymm6  \n\t"
 | 
			
		||||
	"vxorpd		%%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,8)		 \n\t"
 | 
			
		||||
	"vmovups	(%2,%0,8), %%ymm12       \n\t"	// 4 * x
 | 
			
		||||
	"vmovups      32(%2,%0,8), %%ymm13       \n\t"	// 4 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,8)		       \n\t"
 | 
			
		||||
	"vfmadd231pd   (%4,%0,8), %%ymm12, %%ymm4      \n\t" 
 | 
			
		||||
	"vfmadd231pd   (%5,%0,8), %%ymm12, %%ymm5      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%5,%0,8)		       \n\t"
 | 
			
		||||
	"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4      \n\t" 
 | 
			
		||||
	"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%6,%0,8)		       \n\t"
 | 
			
		||||
	"vfmadd231pd   (%6,%0,8), %%ymm12, %%ymm6      \n\t" 
 | 
			
		||||
	"vfmadd231pd   (%7,%0,8), %%ymm12, %%ymm7      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%7,%0,8)		       \n\t"
 | 
			
		||||
	"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6      \n\t" 
 | 
			
		||||
	"vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7      \n\t" 
 | 
			
		||||
 | 
			
		||||
        "addq		$8 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vextractf128   $1 , %%ymm4, %%xmm12	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm5, %%xmm13	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm6, %%xmm14	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm7, %%xmm15	      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vaddpd		%%xmm4, %%xmm12, %%xmm4       \n\t"
 | 
			
		||||
	"vaddpd		%%xmm5, %%xmm13, %%xmm5       \n\t"
 | 
			
		||||
	"vaddpd		%%xmm6, %%xmm14, %%xmm6       \n\t"
 | 
			
		||||
	"vaddpd		%%xmm7, %%xmm15, %%xmm7       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vhaddpd        %%xmm4, %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "vhaddpd        %%xmm5, %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "vhaddpd        %%xmm6, %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "vhaddpd        %%xmm7, %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovsd         %%xmm4,    (%3)         \n\t"
 | 
			
		||||
        "vmovsd         %%xmm5,   8(%3)         \n\t"
 | 
			
		||||
        "vmovsd         %%xmm6,  16(%3)         \n\t"
 | 
			
		||||
        "vmovsd         %%xmm7,  24(%3)         \n\t"
 | 
			
		||||
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -131,6 +131,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 | 
			
		|||
	BLASLONG n2;
 | 
			
		||||
	FLOAT xbuffer[4],*ybuffer;
 | 
			
		||||
 | 
			
		||||
        if ( m < 1 ) return(0);
 | 
			
		||||
        if ( n < 1 ) return(0);
 | 
			
		||||
 | 
			
		||||
	ybuffer = buffer;
 | 
			
		||||
	
 | 
			
		||||
	n1 = n / 4 ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline));
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y)
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline));
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y)
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline));
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y)
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline));
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y)
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,197 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
 | 
			
		||||
#include "sgemv_t_microk_bulldozer-2.c"
 | 
			
		||||
#elif defined(HASWELL)
 | 
			
		||||
#include "sgemv_t_microk_haswell-2.c"
 | 
			
		||||
#elif defined(SANDYBRIDGE)
 | 
			
		||||
#include "sgemv_t_microk_sandy-2.c"
 | 
			
		||||
#elif defined(NEHALEM)
 | 
			
		||||
#include "sgemv_t_microk_nehalem-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/* Upper bound on the number of rows of x that the driver packs into the work buffer and passes to the kernels in one pass. */
#define NBMAX 4096
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
	BLASLONG i;
 | 
			
		||||
	FLOAT *a0,*a1,*a2,*a3;
 | 
			
		||||
	a0 = ap[0];
 | 
			
		||||
	a1 = ap[1];
 | 
			
		||||
	a2 = ap[2];
 | 
			
		||||
	a3 = ap[3];
 | 
			
		||||
	FLOAT temp0 = 0.0;
 | 
			
		||||
	FLOAT temp1 = 0.0;
 | 
			
		||||
	FLOAT temp2 = 0.0;
 | 
			
		||||
	FLOAT temp3 = 0.0;
 | 
			
		||||
 | 
			
		||||
	for ( i=0; i< n; i+=4 )
 | 
			
		||||
	{
 | 
			
		||||
		temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];		
 | 
			
		||||
		temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];		
 | 
			
		||||
		temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];		
 | 
			
		||||
		temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];		
 | 
			
		||||
	}
 | 
			
		||||
	y[0] = temp0;
 | 
			
		||||
	y[1] = temp1;
 | 
			
		||||
	y[2] = temp2;
 | 
			
		||||
	y[3] = temp3;
 | 
			
		||||
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column dot product: *y = sum_{j=0}^{n-1} ap[j] * x[j].
 *
 * Elements are consumed in groups of four, so n is expected to be a
 * multiple of 4. The result overwrites *y.
 */
static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	FLOAT sum = 0.0;
	BLASLONG k = 0;

	while ( k < n )
	{
		sum += ap[k]*x[k] + ap[k+1]*x[k+1] + ap[k+2]*x[k+2] + ap[k+3]*x[k+3];
		k += 4;
	}

	y[0] = sum;
}
 | 
			
		||||
	
 | 
			
		||||
/*
 * Gathers n elements from src, stepping inc_src elements between reads,
 * into the contiguous array dest.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG remaining;

	for ( remaining = n; remaining > 0; remaining-- )
	{
		*dest++ = *src;
		src += inc_src;
	}
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Transposed matrix-vector update. For every column j in [0, n):
 *
 *     y[j*inc_y] += alpha * sum_{i=0}^{m-1} a[i + j*lda] * x[i*inc_x]
 *
 * m, n    : number of rows and columns of a; nothing is done if either is < 1.
 * dummy1  : unused.
 * alpha   : scale factor applied to every dot product.
 * a, lda  : matrix data and the distance (in elements) between columns.
 * x, inc_x: input vector and its stride.
 * y, inc_y: output vector and its stride; results are accumulated into it.
 * buffer  : work area that receives a contiguous copy of the current slice
 *           of x (at most NBMAX elements).
 *
 * Rows are processed in panels of NBMAX rows, followed by one shorter panel
 * whose height is a multiple of 16, followed by a scalar loop over the last
 * m % 16 rows.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	FLOAT  dots[4];
	FLOAT *cols[4];
	FLOAT *xpacked = buffer;
	FLOAT *col;
	FLOAT *out;
	BLASLONG c;
	BLASLONG k;
	BLASLONG r;

	if ( m < 1 || n < 1 ) return(0);

	const BLASLONG col_quads  = n / 4;			/* groups of four columns   */
	const BLASLONG col_rest   = n % 4;			/* leftover single columns  */
	const BLASLONG row_rest   = m % 16;			/* rows for the scalar tail */
	const BLASLONG last_panel = (m % NBMAX) - row_rest;	/* height of the short panel */
	BLASLONG rows_left = m - row_rest;
	BLASLONG panel     = NBMAX;

	for (;;)
	{
		/* panel is always NBMAX at this point. */
		rows_left -= panel;
		if ( rows_left < 0 )
		{
			/* No full panel remains: run the short one, if there is one. */
			if ( last_panel == 0 ) break;
			panel = last_panel;
		}

		copy_x(panel, x, xpacked, inc_x);

		col = a;
		out = y;

		for ( c = 0; c < col_quads; c++ )
		{
			for ( k = 0; k < 4; k++ )
			{
				cols[k] = col;
				col += lda;
			}

			sgemv_kernel_16x4(panel, cols, xpacked, dots);

			for ( k = 0; k < 4; k++ )
			{
				*out += dots[k]*alpha;
				out  += inc_y;
			}
		}

		for ( c = 0; c < col_rest; c++ )
		{
			sgemv_kernel_16x1(panel, col, xpacked, dots);
			col  += lda;
			*out += dots[0]*alpha;
			out  += inc_y;
		}

		/* Step down to the next panel of rows. */
		a += panel;
		x += panel * inc_x;

		/* The short panel is always the last one. */
		if ( panel != NBMAX ) break;
	}

	if ( row_rest == 0 ) return(0);

	/* Scalar handling of the final m % 16 rows. */
	copy_x(row_rest, x, xpacked, inc_x);

	col = a;
	out = y;
	for ( c = 0; c < n; c++ )
	{
		FLOAT acc = 0.0;

		for ( r = 0; r < row_rest; r++ )
		{
			acc += col[r] * xpacked[r];
		}

		col += lda;
		out[0] += alpha * acc;
		out += inc_y;
	}

	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,109 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vxorps		%%xmm4, %%xmm4, %%xmm4	 \n\t"
 | 
			
		||||
	"vxorps		%%xmm5, %%xmm5, %%xmm5	 \n\t"
 | 
			
		||||
	"vxorps		%%xmm6, %%xmm6, %%xmm6	 \n\t"
 | 
			
		||||
	"vxorps		%%xmm7, %%xmm7, %%xmm7	 \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
        "vmovups        (%2,%0,4), %%xmm12       \n\t"  // 4 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,4)		       \n\t"
 | 
			
		||||
	"vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm5,   (%5,%0,4), %%xmm12, %%xmm5 \n\t" 
 | 
			
		||||
        "vmovups      16(%2,%0,4), %%xmm13       \n\t"  // 4 * x
 | 
			
		||||
	"vfmaddps %%xmm6,   (%6,%0,4), %%xmm12, %%xmm6 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm7,   (%7,%0,4), %%xmm12, %%xmm7 \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%5,%0,4)		       \n\t"
 | 
			
		||||
	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" 
 | 
			
		||||
        "vmovups      32(%2,%0,4), %%xmm14       \n\t"  // 4 * x
 | 
			
		||||
	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%6,%0,4)		       \n\t"
 | 
			
		||||
	"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" 
 | 
			
		||||
        "vmovups      48(%2,%0,4), %%xmm15       \n\t"  // 4 * x
 | 
			
		||||
	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%7,%0,4)		       \n\t"
 | 
			
		||||
	"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm5, 48(%5,%0,4), %%xmm15, %%xmm5 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm6, 48(%6,%0,4), %%xmm15, %%xmm6 \n\t" 
 | 
			
		||||
	"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" 
 | 
			
		||||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vhaddps        %%xmm4, %%xmm4, %%xmm4	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm5, %%xmm5, %%xmm5	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm6, %%xmm6, %%xmm6	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm7, %%xmm7, %%xmm7	\n\t"
 | 
			
		||||
 | 
			
		||||
	"vhaddps        %%xmm4, %%xmm4, %%xmm4	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm5, %%xmm5, %%xmm5	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm6, %%xmm6, %%xmm6	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm7, %%xmm7, %%xmm7	\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovss		%%xmm4,    (%3)		\n\t"
 | 
			
		||||
	"vmovss		%%xmm5,   4(%3)		\n\t"
 | 
			
		||||
	"vmovss		%%xmm6,   8(%3)		\n\t"
 | 
			
		||||
	"vmovss		%%xmm7,  12(%3)		\n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm4", "%xmm5", 
 | 
			
		||||
	  "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,112 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			         \n\t"
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4  \n\t"
 | 
			
		||||
	"vxorps		%%ymm5 , %%ymm5, %%ymm5  \n\t"
 | 
			
		||||
	"vxorps		%%ymm6 , %%ymm6, %%ymm6  \n\t"
 | 
			
		||||
	"vxorps		%%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,4)		 \n\t"
 | 
			
		||||
	"vmovups	(%2,%0,4), %%ymm12       \n\t"	// 8 * x
 | 
			
		||||
	"vmovups      32(%2,%0,4), %%ymm13       \n\t"	// 8 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,4)		       \n\t"
 | 
			
		||||
	"vfmadd231ps   (%4,%0,4), %%ymm12, %%ymm4      \n\t" 
 | 
			
		||||
	"vfmadd231ps   (%5,%0,4), %%ymm12, %%ymm5      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%5,%0,4)		       \n\t"
 | 
			
		||||
	"vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4      \n\t" 
 | 
			
		||||
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%6,%0,4)		       \n\t"
 | 
			
		||||
	"vfmadd231ps   (%6,%0,4), %%ymm12, %%ymm6      \n\t" 
 | 
			
		||||
	"vfmadd231ps   (%7,%0,4), %%ymm12, %%ymm7      \n\t" 
 | 
			
		||||
	"prefetcht0	 384(%7,%0,4)		       \n\t"
 | 
			
		||||
	"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6      \n\t" 
 | 
			
		||||
	"vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7      \n\t" 
 | 
			
		||||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vextractf128   $1 , %%ymm4, %%xmm12	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm5, %%xmm13	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm6, %%xmm14	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm7, %%xmm15	      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vaddps		%%xmm4, %%xmm12, %%xmm4       \n\t"
 | 
			
		||||
	"vaddps		%%xmm5, %%xmm13, %%xmm5       \n\t"
 | 
			
		||||
	"vaddps		%%xmm6, %%xmm14, %%xmm6       \n\t"
 | 
			
		||||
	"vaddps		%%xmm7, %%xmm15, %%xmm7       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vhaddps        %%xmm4, %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm5, %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm6, %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm7, %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "vhaddps        %%xmm4, %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm5, %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm6, %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm7, %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovss         %%xmm4,    (%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm5,   4(%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm6,   8(%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm7,  12(%3)         \n\t"
 | 
			
		||||
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,159 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"xorps		%%xmm0 , %%xmm0	         \n\t"
 | 
			
		||||
	"xorps		%%xmm1 , %%xmm1	         \n\t"
 | 
			
		||||
	"xorps		%%xmm2 , %%xmm2	         \n\t"
 | 
			
		||||
	"xorps		%%xmm3 , %%xmm3	         \n\t"
 | 
			
		||||
	"xorps		%%xmm4 , %%xmm4	         \n\t"
 | 
			
		||||
	"xorps		%%xmm5 , %%xmm5	         \n\t"
 | 
			
		||||
	"xorps		%%xmm6 , %%xmm6	         \n\t"
 | 
			
		||||
	"xorps		%%xmm7 , %%xmm7	         \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,4)		       \n\t"
 | 
			
		||||
	"movups	       (%2,%0,4), %%xmm12        \n\t"	// 4 * x
 | 
			
		||||
	"movups      16(%2,%0,4), %%xmm13        \n\t"	// 4 * x
 | 
			
		||||
	"movups             (%4,%0,4), %%xmm8          \n\t" 
 | 
			
		||||
	"movups      32(%2,%0,4), %%xmm14        \n\t"	// 4 * x
 | 
			
		||||
	"movups      48(%2,%0,4), %%xmm15        \n\t"	// 4 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,4)		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups           16(%4,%0,4), %%xmm9          \n\t" 
 | 
			
		||||
	"movups           32(%4,%0,4), %%xmm10         \n\t" 
 | 
			
		||||
	"movups           48(%4,%0,4), %%xmm11         \n\t" 
 | 
			
		||||
	"mulps		%%xmm12, %%xmm8		       \n\t"
 | 
			
		||||
	"addps		%%xmm8 , %%xmm0		       \n\t"
 | 
			
		||||
	"mulps		%%xmm13, %%xmm9		       \n\t"
 | 
			
		||||
	"addps		%%xmm9 , %%xmm4		       \n\t"
 | 
			
		||||
	"movups             (%5,%0,4), %%xmm8          \n\t" 
 | 
			
		||||
	"mulps		%%xmm14, %%xmm10	       \n\t"
 | 
			
		||||
	"addps		%%xmm10, %%xmm0		       \n\t"
 | 
			
		||||
	"mulps		%%xmm15, %%xmm11	       \n\t"
 | 
			
		||||
	"addps		%%xmm11, %%xmm4		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%5,%0,4)		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups           16(%5,%0,4), %%xmm9          \n\t" 
 | 
			
		||||
	"movups           32(%5,%0,4), %%xmm10         \n\t" 
 | 
			
		||||
	"movups           48(%5,%0,4), %%xmm11         \n\t" 
 | 
			
		||||
	"mulps		%%xmm12, %%xmm8		       \n\t"
 | 
			
		||||
	"addps		%%xmm8 , %%xmm1		       \n\t"
 | 
			
		||||
	"mulps		%%xmm13, %%xmm9		       \n\t"
 | 
			
		||||
	"addps		%%xmm9 , %%xmm5		       \n\t"
 | 
			
		||||
	"movups             (%6,%0,4), %%xmm8          \n\t" 
 | 
			
		||||
	"mulps		%%xmm14, %%xmm10	       \n\t"
 | 
			
		||||
	"addps		%%xmm10, %%xmm1		       \n\t"
 | 
			
		||||
	"mulps		%%xmm15, %%xmm11	       \n\t"
 | 
			
		||||
	"addps		%%xmm11, %%xmm5		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%6,%0,4)		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups           16(%6,%0,4), %%xmm9          \n\t" 
 | 
			
		||||
	"movups           32(%6,%0,4), %%xmm10         \n\t" 
 | 
			
		||||
	"movups           48(%6,%0,4), %%xmm11         \n\t" 
 | 
			
		||||
	"mulps		%%xmm12, %%xmm8		       \n\t"
 | 
			
		||||
	"addps		%%xmm8 , %%xmm2		       \n\t"
 | 
			
		||||
	"mulps		%%xmm13, %%xmm9		       \n\t"
 | 
			
		||||
	"addps		%%xmm9 , %%xmm6		       \n\t"
 | 
			
		||||
	"movups             (%7,%0,4), %%xmm8          \n\t" 
 | 
			
		||||
	"mulps		%%xmm14, %%xmm10	       \n\t"
 | 
			
		||||
	"addps		%%xmm10, %%xmm2		       \n\t"
 | 
			
		||||
	"mulps		%%xmm15, %%xmm11	       \n\t"
 | 
			
		||||
	"addps		%%xmm11, %%xmm6		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%7,%0,4)		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups           16(%7,%0,4), %%xmm9          \n\t" 
 | 
			
		||||
	"movups           32(%7,%0,4), %%xmm10         \n\t" 
 | 
			
		||||
	"movups           48(%7,%0,4), %%xmm11         \n\t" 
 | 
			
		||||
	"mulps		%%xmm12, %%xmm8		       \n\t"
 | 
			
		||||
	"addps		%%xmm8 , %%xmm3		       \n\t"
 | 
			
		||||
	"mulps		%%xmm13, %%xmm9		       \n\t"
 | 
			
		||||
	"addps		%%xmm9 , %%xmm7		       \n\t"
 | 
			
		||||
	"mulps		%%xmm14, %%xmm10	       \n\t"
 | 
			
		||||
	"addps		%%xmm10, %%xmm3		       \n\t"
 | 
			
		||||
	"mulps		%%xmm15, %%xmm11	       \n\t"
 | 
			
		||||
	"addps		%%xmm11, %%xmm7		       \n\t"
 | 
			
		||||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"addps	       %%xmm0, %%xmm4		      \n\t"
 | 
			
		||||
	"addps	       %%xmm1, %%xmm5		      \n\t"
 | 
			
		||||
	"addps	       %%xmm2, %%xmm6		      \n\t"
 | 
			
		||||
	"addps	       %%xmm3, %%xmm7		      \n\t"
 | 
			
		||||
 | 
			
		||||
        "haddps        %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "haddps        %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "haddps        %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "haddps        %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "haddps        %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "haddps        %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "haddps        %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "haddps        %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "movss         %%xmm4,    (%3)         \n\t"
 | 
			
		||||
        "movss         %%xmm5,   4(%3)         \n\t"
 | 
			
		||||
        "movss         %%xmm6,   8(%3)         \n\t"
 | 
			
		||||
        "movss         %%xmm7,  12(%3)         \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,132 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
        "vxorps         %%ymm0 , %%ymm0, %%ymm0  \n\t"
 | 
			
		||||
        "vxorps         %%ymm1 , %%ymm1, %%ymm1  \n\t"
 | 
			
		||||
        "vxorps         %%ymm2 , %%ymm2, %%ymm2  \n\t"
 | 
			
		||||
        "vxorps         %%ymm3 , %%ymm3, %%ymm3  \n\t"
 | 
			
		||||
        "vxorps         %%ymm4 , %%ymm4, %%ymm4  \n\t"
 | 
			
		||||
        "vxorps         %%ymm5 , %%ymm5, %%ymm5  \n\t"
 | 
			
		||||
        "vxorps         %%ymm6 , %%ymm6, %%ymm6  \n\t"
 | 
			
		||||
        "vxorps         %%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,4)		       \n\t"
 | 
			
		||||
        "vmovups        (%2,%0,4), %%ymm12       \n\t"  // 8 * x
 | 
			
		||||
        "vmovups      32(%2,%0,4), %%ymm13       \n\t"  // 8 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,4)		       \n\t"
 | 
			
		||||
	"vmulps   (%4,%0,4), %%ymm12, %%ymm8      \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm4, %%ymm8 , %%ymm4	  \n\t"
 | 
			
		||||
	"vmulps 32(%4,%0,4), %%ymm13, %%ymm9      \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm0, %%ymm9 , %%ymm0	  \n\t"
 | 
			
		||||
	"prefetcht0	 384(%5,%0,4)		       \n\t"
 | 
			
		||||
	"vmulps   (%5,%0,4), %%ymm12, %%ymm10     \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm1, %%ymm10, %%ymm1	  \n\t"
 | 
			
		||||
	"vmulps 32(%5,%0,4), %%ymm13, %%ymm11     \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm5, %%ymm11, %%ymm5	  \n\t"
 | 
			
		||||
	"prefetcht0	 384(%6,%0,4)		       \n\t"
 | 
			
		||||
	"vmulps   (%6,%0,4), %%ymm12, %%ymm8      \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm6, %%ymm8 , %%ymm6	  \n\t"
 | 
			
		||||
	"vmulps 32(%6,%0,4), %%ymm13, %%ymm9      \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm2, %%ymm9 , %%ymm2	  \n\t"
 | 
			
		||||
	"prefetcht0	 384(%7,%0,4)		       \n\t"
 | 
			
		||||
	"vmulps   (%7,%0,4), %%ymm12, %%ymm10     \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm7, %%ymm10, %%ymm7	  \n\t"
 | 
			
		||||
	"vmulps 32(%7,%0,4), %%ymm13, %%ymm11     \n\t" 
 | 
			
		||||
	"vaddps	  %%ymm3, %%ymm11, %%ymm3	  \n\t"
 | 
			
		||||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
 | 
			
		||||
        "vaddps         %%ymm4, %%ymm0, %%ymm4       \n\t"
 | 
			
		||||
        "vaddps         %%ymm5, %%ymm1, %%ymm5       \n\t"
 | 
			
		||||
        "vaddps         %%ymm6, %%ymm2, %%ymm6       \n\t"
 | 
			
		||||
        "vaddps         %%ymm7, %%ymm3, %%ymm7       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vextractf128   $1 , %%ymm4, %%xmm12          \n\t"
 | 
			
		||||
        "vextractf128   $1 , %%ymm5, %%xmm13          \n\t"
 | 
			
		||||
        "vextractf128   $1 , %%ymm6, %%xmm14          \n\t"
 | 
			
		||||
        "vextractf128   $1 , %%ymm7, %%xmm15          \n\t"
 | 
			
		||||
 | 
			
		||||
        "vaddps         %%xmm4, %%xmm12, %%xmm4       \n\t"
 | 
			
		||||
        "vaddps         %%xmm5, %%xmm13, %%xmm5       \n\t"
 | 
			
		||||
        "vaddps         %%xmm6, %%xmm14, %%xmm6       \n\t"
 | 
			
		||||
        "vaddps         %%xmm7, %%xmm15, %%xmm7       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vhaddps        %%xmm4, %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm5, %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm6, %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm7, %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "vhaddps        %%xmm4, %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm5, %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm6, %%xmm6, %%xmm6  \n\t"
 | 
			
		||||
        "vhaddps        %%xmm7, %%xmm7, %%xmm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovss         %%xmm4,    (%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm5,   4(%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm6,   8(%3)         \n\t"
 | 
			
		||||
        "vmovss         %%xmm7,  12(%3)         \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,258 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
#if defined(HASWELL)
 | 
			
		||||
#include "zgemv_n_microk_haswell-2.c"
 | 
			
		||||
#elif defined(SANDYBRIDGE)
 | 
			
		||||
#include "zgemv_n_microk_sandy-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Largest number of rows CNAME handles in one pass; each pass accumulates
 * 2*NB values into the caller-supplied buffer before they are scaled into y. */
#define NBMAX 1024
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
	BLASLONG i;
 | 
			
		||||
	FLOAT *a0,*a1,*a2,*a3;
 | 
			
		||||
	a0 = ap[0];
 | 
			
		||||
	a1 = ap[1];
 | 
			
		||||
	a2 = ap[2];
 | 
			
		||||
	a3 = ap[3];
 | 
			
		||||
 | 
			
		||||
	for ( i=0; i< 2*n; i+=2 )
 | 
			
		||||
	{
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
		y[i]   += a0[i]*x[0] - a0[i+1] * x[1];
 | 
			
		||||
		y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
 | 
			
		||||
		y[i]   += a1[i]*x[2] - a1[i+1] * x[3];
 | 
			
		||||
		y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
 | 
			
		||||
		y[i]   += a2[i]*x[4] - a2[i+1] * x[5];
 | 
			
		||||
		y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
 | 
			
		||||
		y[i]   += a3[i]*x[6] - a3[i+1] * x[7];
 | 
			
		||||
		y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
 | 
			
		||||
#else 
 | 
			
		||||
		y[i]   += a0[i]*x[0] + a0[i+1] * x[1];
 | 
			
		||||
		y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
 | 
			
		||||
		y[i]   += a1[i]*x[2] + a1[i+1] * x[3];
 | 
			
		||||
		y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
 | 
			
		||||
		y[i]   += a2[i]*x[4] + a2[i+1] * x[5];
 | 
			
		||||
		y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
 | 
			
		||||
		y[i]   += a3[i]*x[6] + a3[i+1] * x[7];
 | 
			
		||||
		y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column counterpart of zgemv_kernel_16x4: for each of the n complex
 * rows, y[row] += ap[row] * x[0], where x supplies one (re, im) pair and
 * ap / y are interleaved (re, im) arrays of 2*n values.  The #else branch
 * flips the sign of the two terms that involve the imaginary part of the
 * matrix entry.
 */
static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG row;

	for ( row = 0; row < n; row++ )
	{
		FLOAT *ac = ap + 2 * row;
		FLOAT *yc = y  + 2 * row;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
		yc[0] += ac[0]*x[0] - ac[1] * x[1];
		yc[1] += ac[0]*x[1] + ac[1] * x[0];
#else
		yc[0] += ac[0]*x[0] + ac[1] * x[1];
		yc[1] += ac[0]*x[1] - ac[1] * x[0];
#endif
	}
}
 | 
			
		||||
	
 | 
			
		||||
 | 
			
		||||
/*
 * Clears the first 2*n entries of dest, i.e. n interleaved (re, im) pairs.
 */
static void zero_y(BLASLONG n, FLOAT *dest)
{
	BLASLONG k;

	for ( k = 0; k < 2*n; k++ )
		dest[k] = 0.0;
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Scales n complex values from src by (alpha_r, alpha_i) and adds them to
 * dest.
 *
 * src       contiguous (re, im) pairs.
 * dest      (re, im) pairs spaced inc_dest FLOATs apart.
 * inc_dest  distance between consecutive destination pairs, in FLOAT units.
 *
 * Without XCONJ the added value is alpha * src[k]; with XCONJ the branch
 * below uses the sign pattern of alpha * conj(src[k]).
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
	BLASLONG k;

	for ( k = 0; k < n; k++ )
	{
		FLOAT *s = src  + 2 * k;
		FLOAT *d = dest + k * inc_dest;
		FLOAT temp_r;
		FLOAT temp_i;

		/* both products are formed before either destination slot is written */
#if !defined(XCONJ) 
		temp_r = alpha_r * s[0] - alpha_i * s[1];
		temp_i = alpha_r * s[1] + alpha_i * s[0];
#else
		temp_r =  alpha_r * s[0] + alpha_i * s[1];
		temp_i = -alpha_r * s[1] + alpha_i * s[0];
#endif

		d[0] += temp_r;
		d[1] += temp_i;
	}
}
 | 
			
		||||
 | 
			
		||||
/*
 * Complex matrix-vector driver: accumulates alpha * (A * x) into y, with the
 * conjugation variant chosen at compile time through CONJ / XCONJ.
 *
 * m, n      rows and columns of A; the call is a no-op if either is < 1.
 * dummy1    unused.
 * alpha_r,
 * alpha_i   real and imaginary part of the scale factor.
 * a, lda    matrix data; consecutive rows of a column are adjacent and
 *           consecutive columns are lda complex elements apart.
 * x, inc_x  input vector and its stride in complex elements.
 * y, inc_y  output vector (updated in place) and its stride in complex
 *           elements.
 * buffer    scratch space; up to 2*NBMAX FLOATs are written to it.
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 16)
 * through the 16x4 / 16x1 kernels; the remaining m % 16 rows are handled
 * one at a time by the scalar loop at the end.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	FLOAT xbuffer[8];
	FLOAT *ap[4];
	FLOAT *col_ptr;
	FLOAT *xv;
	FLOAT *y_out = y;
	FLOAT *ybuffer = buffer;
	BLASLONG c;
	BLASLONG k;
	BLASLONG r;

#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif

	if ( m < 1 || n < 1 )
		return(0);

	/* convert all strides from complex elements to FLOAT units */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;

	const BLASLONG quads      = n / 4;	/* column groups for the 16x4 kernel */
	const BLASLONG leftovers  = n % 4;	/* columns left for the 16x1 kernel */
	const BLASLONG tail_rows  = m % 16;	/* rows handled by the scalar loop */
	const BLASLONG last_block = (m % NBMAX) - tail_rows;
	BLASLONG rows_left = m - tail_rows;
	BLASLONG NB = NBMAX;

	/* full NBMAX blocks first, then one final shorter block (if any) */
	do
	{
		rows_left -= NB;
		if ( rows_left < 0 )
		{
			if ( last_block == 0 )
				break;
			NB = last_block;
		}

		col_ptr = a;
		xv      = x;
		zero_y(NB,ybuffer);

		for ( c = 0; c < quads; c++ )
		{
			/* gather four complex x entries into a contiguous buffer */
			for ( k = 0; k < 4; k++ )
			{
				xbuffer[2*k]   = xv[0];
				xbuffer[2*k+1] = xv[1];
				xv += inc_x;
			}

			ap[0] = col_ptr;
			for ( k = 1; k < 4; k++ )
				ap[k] = ap[k-1] + lda;

			zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
			col_ptr += 4 * lda;
		}

		for ( c = 0; c < leftovers; c++ )
		{
			xbuffer[0] = xv[0];
			xbuffer[1] = xv[1];
			xv += inc_x;
			zgemv_kernel_16x1(NB,col_ptr,xbuffer,ybuffer);
			col_ptr += 1 * lda;
		}

		add_y(NB,ybuffer,y_out,inc_y,alpha_r,alpha_i);
		a     += 2 * NB;
		y_out += NB * inc_y;
	}
	while ( NB == NBMAX );

	/* remaining rows: one complex dot product per row */
	for ( r = 0; r < tail_rows; r++ )
	{
		FLOAT temp_r = 0.0;
		FLOAT temp_i = 0.0;

		col_ptr = a;
		xv      = x;
		for ( c = 0; c < n; c++ )
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r += col_ptr[0] * xv[0] - col_ptr[1] * xv[1];
			temp_i += col_ptr[0] * xv[1] + col_ptr[1] * xv[0];
#else
			temp_r += col_ptr[0] * xv[0] + col_ptr[1] * xv[1];
			temp_i += col_ptr[0] * xv[1] - col_ptr[1] * xv[0];
#endif

			col_ptr += lda;
			xv      += inc_x;
		}

#if !defined(XCONJ) 
		y_out[0] += alpha_r * temp_r - alpha_i * temp_i;
		y_out[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
		y_out[0] += alpha_r * temp_r + alpha_i * temp_i;
		y_out[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
		y_out += inc_y;
		a     += 2;
	}

	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,137 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vbroadcastsd	  (%2), %%ymm0                  \n\t"  // real part x0
 | 
			
		||||
	"vbroadcastsd	 8(%2), %%ymm1                  \n\t"  // imag part x0
 | 
			
		||||
	"vbroadcastsd	16(%2), %%ymm2                  \n\t"  // real part x1
 | 
			
		||||
	"vbroadcastsd	24(%2), %%ymm3                  \n\t"  // imag part x1
 | 
			
		||||
	"vbroadcastsd	32(%2), %%ymm4                  \n\t"  // real part x2
 | 
			
		||||
	"vbroadcastsd	40(%2), %%ymm5                  \n\t"  // imag part x2
 | 
			
		||||
	"vbroadcastsd	48(%2), %%ymm6                  \n\t"  // real part x3
 | 
			
		||||
	"vbroadcastsd	56(%2), %%ymm7                  \n\t"  // imag part x3
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"prefetcht0      192(%4,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
 | 
			
		||||
	"prefetcht0      192(%5,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%5,%0,8), %%ymm10              \n\t" // 2 complex values form a1
 | 
			
		||||
	"vmovups      32(%5,%0,8), %%ymm11              \n\t" // 2 complex values form a1
 | 
			
		||||
 | 
			
		||||
	"vmulpd      %%ymm8 , %%ymm0, %%ymm12      \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
 | 
			
		||||
	"vmulpd      %%ymm8 , %%ymm1, %%ymm13      \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
 | 
			
		||||
	"vmulpd      %%ymm9 , %%ymm0, %%ymm14      \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
 | 
			
		||||
	"vmulpd      %%ymm9 , %%ymm1, %%ymm15      \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
 | 
			
		||||
 | 
			
		||||
	"prefetcht0      192(%6,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%6,%0,8), %%ymm8	        \n\t" // 2 complex values form a2
 | 
			
		||||
	"vmovups      32(%6,%0,8), %%ymm9	        \n\t" // 2 complex values form a2
 | 
			
		||||
 | 
			
		||||
	"vfmadd231pd      %%ymm10, %%ymm2, %%ymm12      \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm10, %%ymm3, %%ymm13      \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
 | 
			
		||||
	"vfmadd231pd      %%ymm11, %%ymm2, %%ymm14      \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm11, %%ymm3, %%ymm15      \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
 | 
			
		||||
 | 
			
		||||
	"prefetcht0      192(%7,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%7,%0,8), %%ymm10              \n\t" // 2 complex values form a3
 | 
			
		||||
	"vmovups      32(%7,%0,8), %%ymm11              \n\t" // 2 complex values form a3
 | 
			
		||||
 | 
			
		||||
	"vfmadd231pd      %%ymm8 , %%ymm4, %%ymm12      \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm8 , %%ymm5, %%ymm13      \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
 | 
			
		||||
	"vfmadd231pd      %%ymm9 , %%ymm4, %%ymm14      \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm9 , %%ymm5, %%ymm15      \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
 | 
			
		||||
 | 
			
		||||
	"vfmadd231pd      %%ymm10, %%ymm6, %%ymm12      \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm10, %%ymm7, %%ymm13      \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
 | 
			
		||||
	"vfmadd231pd      %%ymm11, %%ymm6, %%ymm14      \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
 | 
			
		||||
	"vfmadd231pd      %%ymm11, %%ymm7, %%ymm15      \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
 | 
			
		||||
 | 
			
		||||
	"prefetcht0      192(%3,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	  (%3,%0,8),  %%ymm10           \n\t"
 | 
			
		||||
	"vmovups	32(%3,%0,8),  %%ymm11           \n\t"
 | 
			
		||||
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm13, %%ymm13               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm15, %%ymm15               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm13, %%ymm12, %%ymm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm15, %%ymm14, %%ymm9              \n\t"
 | 
			
		||||
#else
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm12, %%ymm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm14, %%ymm14               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm12, %%ymm13, %%ymm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm14, %%ymm15, %%ymm9              \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm8 , %%ymm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm9 , %%ymm9                \n\t"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        "vaddpd         %%ymm8, %%ymm10, %%ymm12              \n\t"
 | 
			
		||||
        "vaddpd         %%ymm9, %%ymm11, %%ymm13              \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,8)		        \n\t" // 2 complex values to y	
 | 
			
		||||
	"vmovups  %%ymm13, 32(%3,%0,8)		        \n\t"	
 | 
			
		||||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,149 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vbroadcastsd	  (%2), %%ymm0                  \n\t"  // real part x0
 | 
			
		||||
	"vbroadcastsd	 8(%2), %%ymm1                  \n\t"  // imag part x0
 | 
			
		||||
	"vbroadcastsd	16(%2), %%ymm2                  \n\t"  // real part x1
 | 
			
		||||
	"vbroadcastsd	24(%2), %%ymm3                  \n\t"  // imag part x1
 | 
			
		||||
	"vbroadcastsd	32(%2), %%ymm4                  \n\t"  // real part x2
 | 
			
		||||
	"vbroadcastsd	40(%2), %%ymm5                  \n\t"  // imag part x2
 | 
			
		||||
	"vbroadcastsd	48(%2), %%ymm6                  \n\t"  // real part x3
 | 
			
		||||
	"vbroadcastsd	56(%2), %%ymm7                  \n\t"  // imag part x3
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      256(%4,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm0 , %%ymm12	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm1 , %%ymm13	\n\t"
 | 
			
		||||
        "prefetcht0      256(%5,%0,8)                   \n\t"
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm0 , %%ymm14	\n\t"
 | 
			
		||||
	"vmovups	(%5,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm1 , %%ymm15	\n\t"
 | 
			
		||||
	"vmovups      32(%5,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm2 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm12, %%ymm10, %%ymm12	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm3 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm13, %%ymm11, %%ymm13	\n\t"
 | 
			
		||||
        "prefetcht0      256(%6,%0,8)                   \n\t"
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm2 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm14, %%ymm10, %%ymm14	\n\t"
 | 
			
		||||
	"vmovups	(%6,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm3 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm15, %%ymm11, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups      32(%6,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm4 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm12, %%ymm10, %%ymm12	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm5 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm13, %%ymm11, %%ymm13	\n\t"
 | 
			
		||||
        "prefetcht0      256(%7,%0,8)                   \n\t"
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm4 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm14, %%ymm10, %%ymm14	\n\t"
 | 
			
		||||
	"vmovups	(%7,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm5 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm15, %%ymm11, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups      32(%7,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm6 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm12, %%ymm10, %%ymm12	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm8 , %%ymm7 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm13, %%ymm11, %%ymm13	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm6 , %%ymm10	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm14, %%ymm10, %%ymm14	\n\t"
 | 
			
		||||
	"vmulpd		  %%ymm9 , %%ymm7 , %%ymm11	\n\t"
 | 
			
		||||
	"vaddpd		  %%ymm15, %%ymm11, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0      256(%3,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	  (%3,%0,8),  %%ymm10           \n\t"
 | 
			
		||||
	"vmovups	32(%3,%0,8),  %%ymm11           \n\t"
 | 
			
		||||
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm13, %%ymm13               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm15, %%ymm15               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm13, %%ymm12, %%ymm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm15, %%ymm14, %%ymm9              \n\t"
 | 
			
		||||
#else
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm12, %%ymm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm14, %%ymm14               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm12, %%ymm13, %%ymm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm14, %%ymm15, %%ymm9              \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm8 , %%ymm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm9 , %%ymm9                \n\t"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        "vaddpd         %%ymm8, %%ymm10, %%ymm12              \n\t"
 | 
			
		||||
        "vaddpd         %%ymm9, %%ymm11, %%ymm13              \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,8)		        \n\t" // 2 complex values to y	
 | 
			
		||||
	"vmovups  %%ymm13, 32(%3,%0,8)		        \n\t"	
 | 
			
		||||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,272 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include "common.h"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
 | 
			
		||||
#include "zgemv_t_microk_bulldozer-2.c"
 | 
			
		||||
#elif defined(HASWELL)
 | 
			
		||||
#include "zgemv_t_microk_haswell-2.c"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Largest number of matrix rows (and x elements staged via copy_x) handled per pass of the blocked loop. */
#define NBMAX 1028
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_16x4
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
	BLASLONG i;
 | 
			
		||||
	FLOAT *a0,*a1,*a2,*a3;
 | 
			
		||||
	a0 = ap[0];
 | 
			
		||||
	a1 = ap[1];
 | 
			
		||||
	a2 = ap[2];
 | 
			
		||||
	a3 = ap[3];
 | 
			
		||||
	FLOAT temp_r0 = 0.0;
 | 
			
		||||
	FLOAT temp_r1 = 0.0;
 | 
			
		||||
	FLOAT temp_r2 = 0.0;
 | 
			
		||||
	FLOAT temp_r3 = 0.0;
 | 
			
		||||
	FLOAT temp_i0 = 0.0;
 | 
			
		||||
	FLOAT temp_i1 = 0.0;
 | 
			
		||||
	FLOAT temp_i2 = 0.0;
 | 
			
		||||
	FLOAT temp_i3 = 0.0;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	for ( i=0; i< 2*n; i+=2 )
 | 
			
		||||
	{
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
		temp_r0 += a0[i]*x[i]   - a0[i+1]*x[i+1];		
 | 
			
		||||
		temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];		
 | 
			
		||||
		temp_r1 += a1[i]*x[i]   - a1[i+1]*x[i+1];		
 | 
			
		||||
		temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];		
 | 
			
		||||
		temp_r2 += a2[i]*x[i]   - a2[i+1]*x[i+1];		
 | 
			
		||||
		temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];		
 | 
			
		||||
		temp_r3 += a3[i]*x[i]   - a3[i+1]*x[i+1];		
 | 
			
		||||
		temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];		
 | 
			
		||||
#else
 | 
			
		||||
		temp_r0 += a0[i]*x[i]   + a0[i+1]*x[i+1];		
 | 
			
		||||
		temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];		
 | 
			
		||||
		temp_r1 += a1[i]*x[i]   + a1[i+1]*x[i+1];		
 | 
			
		||||
		temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];		
 | 
			
		||||
		temp_r2 += a2[i]*x[i]   + a2[i+1]*x[i+1];		
 | 
			
		||||
		temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];		
 | 
			
		||||
		temp_r3 += a3[i]*x[i]   + a3[i+1]*x[i+1];		
 | 
			
		||||
		temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];		
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
	y[0] = temp_r0;
 | 
			
		||||
	y[1] = temp_i0;
 | 
			
		||||
	y[2] = temp_r1;
 | 
			
		||||
	y[3] = temp_i1;
 | 
			
		||||
	y[4] = temp_r2;
 | 
			
		||||
	y[5] = temp_i2;
 | 
			
		||||
	y[6] = temp_r3;
 | 
			
		||||
	y[7] = temp_i3;
 | 
			
		||||
}
 | 
			
		||||
	
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * Single-column complex dot product: sums ap[k] * x[k] over n complex
 * elements (interleaved re/im) and writes the real part to y[0] and the
 * imaginary part to y[1].  CONJ/XCONJ select the sign pattern.
 */
static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	FLOAT acc_re = 0.0;
	FLOAT acc_im = 0.0;
	const BLASLONG len = 2 * n;	/* number of scalar entries */
	BLASLONG k = 0;

	while ( k < len )
	{
		const FLOAT ar = ap[k];
		const FLOAT ai = ap[k+1];
		const FLOAT xr = x[k];
		const FLOAT xi = x[k+1];

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
		acc_re += ar*xr - ai*xi;
		acc_im += ar*xi + ai*xr;
#else
		acc_re += ar*xr + ai*xi;
		acc_im += ar*xi - ai*xr;
#endif
		k += 2;
	}

	y[0] = acc_re;
	y[1] = acc_im;
}
 | 
			
		||||
	
 | 
			
		||||
/*
 * Gathers n complex values from a strided vector into a packed buffer.
 * Element k is read from src + k*inc_src (inc_src counted in FLOATs) and
 * stored as the pair dest[2*k], dest[2*k+1].
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
        BLASLONG k;

        for ( k = 0; k < n; k++ )
        {
                const FLOAT *from = src + k * inc_src;

                dest[2*k]   = from[0];
                dest[2*k+1] = from[1];
        }
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Complex transposed matrix-vector driver.
 *
 * For each of the n columns of a (columns are lda complex elements apart,
 * rows are contiguous), forms the dot product of that column with the m
 * elements of x and accumulates alpha times that value into the
 * corresponding element of y.  CONJ/XCONJ select the conjugation variant
 * at compile time.
 *
 *   m, n             rows / columns of a; nothing is done if either is < 1
 *   dummy1           unused
 *   alpha_r/alpha_i  real and imaginary part of the scale factor
 *   a, lda           matrix and its leading dimension (complex elements)
 *   x, inc_x         input vector and its stride (complex elements)
 *   y, inc_y         vector updated in place and its stride
 *   buffer           scratch space; at least 2*NBMAX FLOATs are used to
 *                    hold a packed copy of the current slice of x
 *
 * Rows are processed in slices of at most NBMAX through the 4-column and
 * 1-column kernels; the last m % 16 rows are handled by scalar code.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[8];
	FLOAT ybuffer[8];
	FLOAT *xbuffer = buffer;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	/* From here on strides are counted in FLOATs, not complex elements. */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;

	BLASLONG n1 = n / 4 ;		/* groups of four columns */
	BLASLONG n2 = n % 4 ;		/* leftover single columns */

	BLASLONG m3 = m % 16;		/* rows left for the scalar tail */
	BLASLONG m1 = m - m3;		/* rows still to be fed to the kernels */

	/*
	 * Walk the kernel rows in slices of at most NBMAX.  Tracking the
	 * remaining row count directly keeps every slice positive; m1 is a
	 * multiple of 16 and NBMAX a multiple of 4, so each slice stays a
	 * multiple of 4 as the unrolled kernels require.  (Deriving the last
	 * slice from m % NBMAX is only valid when NBMAX is a multiple of 16.)
	 */
	while ( m1 > 0 )
	{
		BLASLONG NB = ( m1 < NBMAX ) ? m1 : NBMAX;

		y_ptr = y;
		a_ptr = a;
		copy_x(NB,x,xbuffer,inc_x);

		for( i = 0; i < n1 ; i++)
		{
			ap[0] = a_ptr;
			ap[1] = a_ptr + lda;
			ap[2] = ap[1] + lda;
			ap[3] = ap[2] + lda;
			zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
			a_ptr += 4 * lda;

			/* y += alpha * (partial dot product) for the four columns */
			for( j = 0; j < 4 ; j++)
			{
				FLOAT t_r = ybuffer[2*j];
				FLOAT t_i = ybuffer[2*j+1];
#if !defined(XCONJ)
				y_ptr[0] += alpha_r * t_r - alpha_i * t_i;
				y_ptr[1] += alpha_r * t_i + alpha_i * t_r;
#else
				y_ptr[0] += alpha_r * t_r + alpha_i * t_i;
				y_ptr[1] -= alpha_r * t_i - alpha_i * t_r;
#endif
				y_ptr  += inc_y;
			}
		}

		for( i = 0; i < n2 ; i++)
		{
			zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
			a_ptr += 1 * lda;

#if !defined(XCONJ)
			y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
			y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
#else
			y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
			y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
#endif
			y_ptr  += inc_y;
		}

		/* Step down to the next slice of rows. */
		a  += 2 * NB;
		x  += NB * inc_x;
		m1 -= NB;
	}

	if ( m3 == 0 ) return(0);

	/* Scalar tail: the remaining m3 rows, one column at a time. */
	copy_x(m3,x,xbuffer,inc_x);
	a_ptr = a;
	y_ptr = y;

	for( j = 0; j < n ; j++)
	{
		FLOAT temp_r = 0.0;
		FLOAT temp_i = 0.0;

		for( i = 0; i < m3*2; i+=2 )
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r += a_ptr[i] * xbuffer[i]   - a_ptr[i+1] * xbuffer[i+1];
			temp_i += a_ptr[i] * xbuffer[i+1] + a_ptr[i+1] * xbuffer[i];
#else
			temp_r += a_ptr[i] * xbuffer[i]   + a_ptr[i+1] * xbuffer[i+1];
			temp_i += a_ptr[i] * xbuffer[i+1] - a_ptr[i+1] * xbuffer[i];
#endif
		}
		a_ptr += lda;

#if !defined(XCONJ)
		y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
		y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
		y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
		y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

		y_ptr += inc_y;
	}
	return(0);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,180 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorpd		%%xmm8 , %%xmm8 , %%xmm8 	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%xmm9 , %%xmm9 , %%xmm9 	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%xmm10, %%xmm10, %%xmm10	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%xmm11, %%xmm11, %%xmm11	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%xmm12, %%xmm12, %%xmm12	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%xmm13, %%xmm13, %%xmm13	\n\t"
 | 
			
		||||
	"vxorpd		%%xmm14, %%xmm14, %%xmm14	\n\t"
 | 
			
		||||
	"vxorpd		%%xmm15, %%xmm15, %%xmm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	  8(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
 | 
			
		||||
	"prefetcht0  192(%4,%0,8)                       \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%xmm4	        \n\t" // 1 complex values from a0
 | 
			
		||||
	"prefetcht0  192(%5,%0,8)                       \n\t"
 | 
			
		||||
	"vmovups	(%5,%0,8), %%xmm5               \n\t" // 1 complex values from a1
 | 
			
		||||
	"prefetcht0  192(%6,%0,8)                       \n\t"
 | 
			
		||||
	"vmovups	(%6,%0,8), %%xmm6	        \n\t" // 1 complex values from a2
 | 
			
		||||
	"prefetcht0  192(%7,%0,8)                       \n\t"
 | 
			
		||||
	"vmovups	(%7,%0,8), %%xmm7               \n\t" // 1 complex values from a3
 | 
			
		||||
 | 
			
		||||
	"vfmaddpd   %%xmm8 ,   %%xmm4 , %%xmm0, %%xmm8       \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm9 ,   %%xmm4 , %%xmm1, %%xmm9       \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm10,   %%xmm5 , %%xmm0, %%xmm10      \n\t" // ar0*xr0,al0*xr0
 | 
			
		||||
	"vfmaddpd   %%xmm11,   %%xmm5 , %%xmm1, %%xmm11      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm12,   %%xmm6 , %%xmm0, %%xmm12      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm13,   %%xmm6 , %%xmm1, %%xmm13      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm14,   %%xmm7 , %%xmm0, %%xmm14      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm15,   %%xmm7 , %%xmm1, %%xmm15      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
 | 
			
		||||
	"vmovddup	 16(%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	 24(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
 | 
			
		||||
	"vmovups      16(%4,%0,8), %%xmm4	        \n\t" // 1 complex values from a0
 | 
			
		||||
	"vmovups      16(%5,%0,8), %%xmm5               \n\t" // 1 complex values from a1
 | 
			
		||||
	"vmovups      16(%6,%0,8), %%xmm6	        \n\t" // 1 complex values from a2
 | 
			
		||||
	"vmovups      16(%7,%0,8), %%xmm7               \n\t" // 1 complex values from a3
 | 
			
		||||
 | 
			
		||||
	"vfmaddpd   %%xmm8 ,   %%xmm4 , %%xmm0, %%xmm8       \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm9 ,   %%xmm4 , %%xmm1, %%xmm9       \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm10,   %%xmm5 , %%xmm0, %%xmm10      \n\t" // ar0*xr0,al0*xr0
 | 
			
		||||
	"vfmaddpd   %%xmm11,   %%xmm5 , %%xmm1, %%xmm11      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm12,   %%xmm6 , %%xmm0, %%xmm12      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm13,   %%xmm6 , %%xmm1, %%xmm13      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm14,   %%xmm7 , %%xmm0, %%xmm14      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm15,   %%xmm7 , %%xmm1, %%xmm15      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
 | 
			
		||||
	"vmovddup	 32(%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	 40(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%xmm4	        \n\t" // 1 complex values from a0
 | 
			
		||||
	"vmovups      32(%5,%0,8), %%xmm5               \n\t" // 1 complex values from a1
 | 
			
		||||
	"vmovups      32(%6,%0,8), %%xmm6	        \n\t" // 1 complex values from a2
 | 
			
		||||
	"vmovups      32(%7,%0,8), %%xmm7               \n\t" // 1 complex values from a3
 | 
			
		||||
 | 
			
		||||
	"vfmaddpd   %%xmm8 ,   %%xmm4 , %%xmm0, %%xmm8       \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm9 ,   %%xmm4 , %%xmm1, %%xmm9       \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm10,   %%xmm5 , %%xmm0, %%xmm10      \n\t" // ar0*xr0,al0*xr0
 | 
			
		||||
	"vfmaddpd   %%xmm11,   %%xmm5 , %%xmm1, %%xmm11      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm12,   %%xmm6 , %%xmm0, %%xmm12      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm13,   %%xmm6 , %%xmm1, %%xmm13      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm14,   %%xmm7 , %%xmm0, %%xmm14      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm15,   %%xmm7 , %%xmm1, %%xmm15      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
 | 
			
		||||
	"vmovddup	 48(%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	 56(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
 | 
			
		||||
	"vmovups      48(%4,%0,8), %%xmm4	        \n\t" // 1 complex values from a0
 | 
			
		||||
	"vmovups      48(%5,%0,8), %%xmm5               \n\t" // 1 complex values from a1
 | 
			
		||||
	"vmovups      48(%6,%0,8), %%xmm6	        \n\t" // 1 complex values from a2
 | 
			
		||||
	"vmovups      48(%7,%0,8), %%xmm7               \n\t" // 1 complex values from a3
 | 
			
		||||
 | 
			
		||||
	"vfmaddpd   %%xmm8 ,   %%xmm4 , %%xmm0, %%xmm8       \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm9 ,   %%xmm4 , %%xmm1, %%xmm9       \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm10,   %%xmm5 , %%xmm0, %%xmm10      \n\t" // ar0*xr0,al0*xr0
 | 
			
		||||
	"vfmaddpd   %%xmm11,   %%xmm5 , %%xmm1, %%xmm11      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm12,   %%xmm6 , %%xmm0, %%xmm12      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm13,   %%xmm6 , %%xmm1, %%xmm13      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
	"vfmaddpd   %%xmm14,   %%xmm7 , %%xmm0, %%xmm14      \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm15,   %%xmm7 , %%xmm1, %%xmm15      \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm9 , %%xmm9                \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm11, %%xmm11               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm13, %%xmm13               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm15, %%xmm15               \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm9 , %%xmm8, %%xmm8               \n\t" 
 | 
			
		||||
        "vaddsubpd      %%xmm11, %%xmm10, %%xmm10             \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm13, %%xmm12, %%xmm12             \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm15, %%xmm14, %%xmm14             \n\t"
 | 
			
		||||
#else
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm8 , %%xmm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm10, %%xmm10               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm12, %%xmm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm14, %%xmm14               \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm8 , %%xmm9 , %%xmm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm10, %%xmm11, %%xmm10             \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm12, %%xmm13, %%xmm12             \n\t"
 | 
			
		||||
        "vaddsubpd      %%xmm14, %%xmm15, %%xmm14             \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm8 , %%xmm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm10, %%xmm10               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm12, %%xmm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x1 , %%xmm14, %%xmm14               \n\t"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	"vmovups	%%xmm8 ,   (%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm10, 16(%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm12, 32(%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm14, 48(%3)			\n\t"
 | 
			
		||||
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,162 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2014, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
#define HAVE_KERNEL_16x4 1
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	BLASLONG register i = 0;
 | 
			
		||||
 | 
			
		||||
	__asm__  __volatile__
 | 
			
		||||
	(
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorpd		%%ymm8 , %%ymm8 , %%ymm8 	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%ymm9 , %%ymm9 , %%ymm9 	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%ymm10, %%ymm10, %%ymm10	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%ymm11, %%ymm11, %%ymm11	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%ymm12, %%ymm12, %%ymm12	\n\t" // temp
 | 
			
		||||
	"vxorpd		%%ymm13, %%ymm13, %%ymm13	\n\t"
 | 
			
		||||
	"vxorpd		%%ymm14, %%ymm14, %%ymm14	\n\t"
 | 
			
		||||
	"vxorpd		%%ymm15, %%ymm15, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      192(%2,%0,8)                   \n\t"
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
        "prefetcht0      192(%4,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%5,%0,8), %%ymm5               \n\t" // 2 complex values from a1
 | 
			
		||||
	"vmovddup	  8(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm4	        \n\t" // 2 complex values from a0
 | 
			
		||||
        "prefetcht0      192(%5,%0,8)                   \n\t"
 | 
			
		||||
	"vmovddup	 16(%2,%0,8), %%xmm2            \n\t"  // real value from x1
 | 
			
		||||
        "prefetcht0      192(%6,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%6,%0,8), %%ymm6	        \n\t" // 2 complex values from a2
 | 
			
		||||
	"vmovddup	 24(%2,%0,8), %%xmm3            \n\t"  // imag value from x1
 | 
			
		||||
        "prefetcht0      192(%7,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%7,%0,8), %%ymm7               \n\t" // 2 complex values from a3
 | 
			
		||||
	"vinsertf128	 $1, %%xmm2, %%ymm0 , %%ymm0	\n\t"  // real values from x0 and x1
 | 
			
		||||
	"vinsertf128	 $1, %%xmm3, %%ymm1 , %%ymm1	\n\t"  // imag values from x0 and x1
 | 
			
		||||
 | 
			
		||||
	"vfmadd231pd      %%ymm4 , %%ymm0, %%ymm8       \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm4 , %%ymm1, %%ymm9       \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm5 , %%ymm0, %%ymm10      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm5 , %%ymm1, %%ymm11      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm6 , %%ymm0, %%ymm12      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm6 , %%ymm1, %%ymm13      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm7 , %%ymm0, %%ymm14      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm7 , %%ymm1, %%ymm15      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
 | 
			
		||||
	"vmovups       32(%4,%0,8), %%ymm4	        \n\t" // 2 complex values from a0
 | 
			
		||||
	"vmovups       32(%5,%0,8), %%ymm5              \n\t" // 2 complex values from a1
 | 
			
		||||
	"vmovddup	 32(%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	 40(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
	"vmovddup	 48(%2,%0,8), %%xmm2            \n\t"  // real value from x1
 | 
			
		||||
	"vmovddup	 56(%2,%0,8), %%xmm3            \n\t"  // imag value from x1
 | 
			
		||||
	"vmovups       32(%6,%0,8), %%ymm6	        \n\t" // 2 complex values from a2
 | 
			
		||||
	"vmovups       32(%7,%0,8), %%ymm7               \n\t" // 2 complex values from a3
 | 
			
		||||
	"vinsertf128	 $1, %%xmm2, %%ymm0 , %%ymm0	\n\t"  // real values from x0 and x1
 | 
			
		||||
	"vinsertf128	 $1, %%xmm3, %%ymm1 , %%ymm1	\n\t"  // imag values from x0 and x1
 | 
			
		||||
 | 
			
		||||
	"vfmadd231pd      %%ymm4 , %%ymm0, %%ymm8       \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm4 , %%ymm1, %%ymm9       \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm5 , %%ymm0, %%ymm10      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm5 , %%ymm1, %%ymm11      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm6 , %%ymm0, %%ymm12      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm6 , %%ymm1, %%ymm13      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
	"vfmadd231pd      %%ymm7 , %%ymm0, %%ymm14      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 
 | 
			
		||||
	"vfmadd231pd      %%ymm7 , %%ymm1, %%ymm15      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 
 | 
			
		||||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
 | 
			
		||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm9 , %%ymm9                \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm11, %%ymm11               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm13, %%ymm13               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm15, %%ymm15               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm9 , %%ymm8, %%ymm8               \n\t" 
 | 
			
		||||
        "vaddsubpd      %%ymm11, %%ymm10, %%ymm10             \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm13, %%ymm12, %%ymm12             \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm15, %%ymm14, %%ymm14             \n\t"
 | 
			
		||||
#else
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm8 , %%ymm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm10, %%ymm10               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm12, %%ymm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm14, %%ymm14               \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm8 , %%ymm9 , %%ymm8              \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm10, %%ymm11, %%ymm10             \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm12, %%ymm13, %%ymm12             \n\t"
 | 
			
		||||
        "vaddsubpd      %%ymm14, %%ymm15, %%ymm14             \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm8 , %%ymm8                \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm10, %%ymm10               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm12, %%ymm12               \n\t"
 | 
			
		||||
        "vpermilpd      $0x5 , %%ymm14, %%ymm14               \n\t"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
	"vextractf128   $1, %%ymm8 , %%xmm9		      \n\t"
 | 
			
		||||
	"vextractf128   $1, %%ymm10, %%xmm11	      	      \n\t"
 | 
			
		||||
	"vextractf128   $1, %%ymm12, %%xmm13		      \n\t"
 | 
			
		||||
	"vextractf128   $1, %%ymm14, %%xmm15		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vaddpd		%%xmm8 , %%xmm9 , %%xmm8       \n\t"
 | 
			
		||||
	"vaddpd		%%xmm10, %%xmm11, %%xmm10      \n\t"
 | 
			
		||||
	"vaddpd		%%xmm12, %%xmm13, %%xmm12      \n\t"
 | 
			
		||||
	"vaddpd		%%xmm14, %%xmm15, %%xmm14      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	%%xmm8 ,   (%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm10, 16(%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm12, 32(%3)			\n\t"
 | 
			
		||||
	"vmovups	%%xmm14, 48(%3)			\n\t"
 | 
			
		||||
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
          "r" (i),	// 0	
 | 
			
		||||
	  "r" (n),  	// 1
 | 
			
		||||
          "r" (x),      // 2
 | 
			
		||||
          "r" (y),      // 3
 | 
			
		||||
          "r" (ap[0]),  // 4
 | 
			
		||||
          "r" (ap[1]),  // 5
 | 
			
		||||
          "r" (ap[2]),  // 6
 | 
			
		||||
          "r" (ap[3])   // 7
 | 
			
		||||
	: "cc", 
 | 
			
		||||
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 | 
			
		||||
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 | 
			
		||||
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 | 
			
		||||
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 | 
			
		||||
	  "memory"
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1,8 +1,8 @@
 | 
			
		|||
Data file for testing DSGESV/DSPOSV LAPACK routines
 | 
			
		||||
12                                      Number of values of M
 | 
			
		||||
0 1 2 13 17 45 78 91 101 119 120 132    values of M (row dimension)
 | 
			
		||||
4                                       Number of values of NRHS
 | 
			
		||||
1 2 14 16                               Values of NRHS (number of right hand sides)
 | 
			
		||||
6                                       Number of values of NRHS
 | 
			
		||||
1 2 14 15 16 13                         Values of NRHS (number of right hand sides)
 | 
			
		||||
30.0                                    Threshold value of test ratio
 | 
			
		||||
T                                       Put T to test the driver routine
 | 
			
		||||
T                                       Put T to test the error exits
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue