Optimized dgemv_t kernel for Haswell
This commit is contained in:
parent
debc6d1a05
commit
8109d8232c
|
@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
/*
|
#if defined(HASWELL)
|
||||||
#if defined(NEHALEM)
|
|
||||||
#include "dgemv_t_microk_nehalem-4.c"
|
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
|
||||||
#include "dgemv_t_microk_bulldozer-4.c"
|
|
||||||
#elif defined(SANDYBRIDGE)
|
|
||||||
#include "dgemv_t_microk_sandy-4.c"
|
|
||||||
#elif defined(HASWELL)
|
|
||||||
#include "dgemv_t_microk_haswell-4.c"
|
#include "dgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
*/
|
|
||||||
|
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
|
|
||||||
|
|
|
@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
".L01LOOP%=: \n\t"
|
||||||
"prefetcht0 384(%2,%0,8) \n\t"
|
// "prefetcht0 384(%2,%0,8) \n\t"
|
||||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||||
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
|
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
|
||||||
|
|
||||||
"prefetcht0 384(%4,%0,8) \n\t"
|
// "prefetcht0 384(%4,%0,8) \n\t"
|
||||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||||
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||||
"prefetcht0 384(%5,%0,8) \n\t"
|
// "prefetcht0 384(%5,%0,8) \n\t"
|
||||||
"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
"prefetcht0 384(%6,%0,8) \n\t"
|
|
||||||
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
|
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
|
||||||
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
|
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
|
||||||
"prefetcht0 384(%7,%0,8) \n\t"
|
// "prefetcht0 384(%6,%0,8) \n\t"
|
||||||
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
|
"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||||
"vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
|
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||||
|
"addq $8 , %0 \n\t"
|
||||||
|
// "prefetcht0 384(%7,%0,8) \n\t"
|
||||||
|
"vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
|
||||||
"subq $8 , %1 \n\t"
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz .L01LOOP%= \n\t"
|
||||||
|
|
||||||
".L16END%=: \n\t"
|
".L16END%=: \n\t"
|
||||||
|
|
Loading…
Reference in New Issue