corrected and tested FMA3 Code

This commit is contained in:
wernsaar 2013-10-19 10:52:20 +02:00
parent f51a849d91
commit f6b50057e2
4 changed files with 83 additions and 28 deletions

View File

@ -37,6 +37,11 @@
/*********************************************************************/
/*********************************************************************
* 2013/10/19 Saar
* BLASTEST :
* CTEST : OK
* TEST : OK
*
* 2013/08/16 Saar
* Parameter:
* CGEMM_DEFAULT_UNROLL_N 2
@ -139,7 +144,7 @@
#endif
#if defined(BULLDOZER) || defined(PILEDRIVER)
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
@ -188,41 +193,41 @@
/*
 * FMA3 definitions of VFMADDPS_R / VFMADDPS_I, packed single precision
 * (presumably the real and imaginary accumulation steps, judging by the
 * _R / _I suffixes -- confirm against the kernel body).
 *
 * The variant macro defined at build time selects which instruction
 * each wrapper expands to:
 *
 *   NN NT TN TT : _R = vfmadd231ps    _I = vfmadd231ps
 *   RN RT CN CT : _R = vfnmadd231ps   _I = vfmadd231ps
 *   NR NC TR TC : _R = vfmadd231ps    _I = vfnmadd231ps
 *   otherwise   : _R = vfnmadd231ps   _I = vfnmadd231ps
 *
 * NOTE(review): every macro body below lists two instruction lines that
 * differ only in operand order. In AT&T order the last operand is the
 * destination, so the first form accumulates into y2 and the second
 * into y0. This looks like the before/after pair of the operand-order
 * correction rendered without diff markers; the second form is
 * presumably the intended one. Confirm which single line the real file
 * holds before assembling from this text.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* plain variants: both wrappers add the product */
.macro VFMADDPS_R y0,y1,y2
vfmadd231ps \y0,\y1,\y2
vfmadd231ps \y1,\y2,\y0
.endm
.macro VFMADDPS_I y0,y1,y2
vfmadd231ps \y0,\y1,\y2
vfmadd231ps \y1,\y2,\y0
.endm
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* _R uses the negated-product form, _I adds the product */
.macro VFMADDPS_R y0,y1,y2
vfnmadd231ps \y0,\y1,\y2
vfnmadd231ps \y1,\y2,\y0
.endm
.macro VFMADDPS_I y0,y1,y2
vfmadd231ps \y0,\y1,\y2
vfmadd231ps \y1,\y2,\y0
.endm
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* _R adds the product, _I uses the negated-product form */
.macro VFMADDPS_R y0,y1,y2
vfmadd231ps \y0,\y1,\y2
vfmadd231ps \y1,\y2,\y0
.endm
.macro VFMADDPS_I y0,y1,y2
vfnmadd231ps \y0,\y1,\y2
vfnmadd231ps \y1,\y2,\y0
.endm
#else
/* remaining variants: both wrappers use the negated-product form */
.macro VFMADDPS_R y0,y1,y2
vfnmadd231ps \y0,\y1,\y2
vfnmadd231ps \y1,\y2,\y0
.endm
.macro VFMADDPS_I y0,y1,y2
vfnmadd231ps \y0,\y1,\y2
vfnmadd231ps \y1,\y2,\y0
.endm
#endif

View File

@ -36,6 +36,31 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*********************************************************************
* 2013/10/19 Saar
* BLASTEST :
* CTEST : OK
* TEST : OK
*
*
* 2013/08/15 Saar
* Parameter:
* SGEMM_DEFAULT_UNROLL_N 2
* SGEMM_DEFAULT_UNROLL_M 16
* SGEMM_DEFAULT_P 384
* SGEMM_DEFAULT_Q 168
*
* BLASTEST: OK
*
* Performance:
* 1 thread: 2.31 times faster than sandybridge
* 4 threads: 2.26 times faster than sandybridge
*
* Compile for FMA3: OK
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
@ -130,11 +155,11 @@
#else
/*
 * VFMADD231PD_ y0,y1,y2 -- packed double-precision FMA3 wrapper used in
 * this #else branch (the selecting condition is outside this excerpt).
 *
 * NOTE(review): two instruction lines appear in the body. In AT&T order
 * the last operand is the destination, so the first accumulates into y2
 * and the second into y0. This looks like the before/after pair of the
 * operand-order correction rendered without diff markers; the second
 * line is presumably the intended one -- confirm against the real file.
 */
.macro VFMADD231PD_ y0,y1,y2
vfmadd231pd \y0,\y1,\y2
vfmadd231pd \y2,\y1,\y0
.endm
/*
 * VFMADD231SD_ x0,x1,x2 -- scalar double-precision FMA3 wrapper.
 *
 * NOTE(review): two instruction lines appear in the body. In AT&T order
 * the last operand is the destination, so the first accumulates into x2
 * and the second into x0. This looks like the before/after pair of the
 * operand-order correction rendered without diff markers; the second
 * line is presumably the intended one -- confirm against the real file.
 */
.macro VFMADD231SD_ x0,x1,x2
vfmadd231sd \x0,\x1,\x2
vfmadd231sd \x2,\x1,\x0
.endm
#endif

View File

@ -36,6 +36,28 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*********************************************************************
* 2013/10/19 Saar
* BLASTEST :
* CTEST : OK
* TEST : OK
*
* 2013/08/15 Saar
* Parameter:
* SGEMM_DEFAULT_UNROLL_N 4
* SGEMM_DEFAULT_UNROLL_M 16
* SGEMM_DEFAULT_P 768
* SGEMM_DEFAULT_Q 168
*
* BLASTEST: OK
*
* Performance:
* 1 thread: 2.22 times faster than sandybridge
* 4 threads: 2.26 times faster than sandybridge
*
* Compile for FMA3: OK
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
@ -60,7 +82,6 @@
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#define CO2 %rdx
#ifndef WINDOWS_ABI
@ -131,11 +152,11 @@
#else
/*
 * VFMADD231PS_ y0,y1,y2 -- packed single-precision FMA3 wrapper used in
 * this #else branch (the selecting condition is outside this excerpt).
 *
 * NOTE(review): two instruction lines appear in the body. In AT&T order
 * the last operand is the destination, so the first accumulates into y2
 * and the second into y0. This looks like the before/after pair of the
 * operand-order correction rendered without diff markers; the second
 * line is presumably the intended one -- confirm against the real file.
 */
.macro VFMADD231PS_ y0,y1,y2
vfmadd231ps \y0,\y1,\y2
vfmadd231ps \y1,\y2,\y0
.endm
/*
 * VFMADD231SS_ x0,x1,x2 -- scalar single-precision FMA3 wrapper.
 *
 * NOTE(review): two instruction lines appear in the body. In AT&T order
 * the last operand is the destination, so the first accumulates into x2
 * and the second into x0. This looks like the before/after pair of the
 * operand-order correction rendered without diff markers; the second
 * line is presumably the intended one -- confirm against the real file.
 */
.macro VFMADD231SS_ x0,x1,x2
vfmadd231ss \x0,\x1,\x2
vfmadd231ss \x1,\x2,\x0
.endm
#endif
@ -791,7 +812,7 @@
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
@ -836,8 +857,8 @@
#ifdef TRMMKERNEL
vmovss %xmm12, OFFSET
vmovss %xmm12, KK
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
@ -1629,7 +1650,7 @@
.L4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
addq $4, KK
#endif
decq J // j --

View File

@ -37,6 +37,11 @@
/*********************************************************************/
/*********************************************************************
* 2013/10/19 Saar
* BLASTEST :
* CTEST : OK
* TEST : OK
*
* 2013/08/16 Saar
* Parameter:
* ZGEMM_DEFAULT_UNROLL_N 2
@ -44,7 +49,6 @@
* ZGEMM_DEFAULT_P 112
* ZGEMM_DEFAULT_Q 224
*
* BLASTEST: OK
*
* Performance:
* 1 thread: 1.80 times faster than sandybridge
@ -138,7 +142,7 @@
#endif
#if defined(BULLDOZER) || defined(PILEDRIVER)
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
@ -187,41 +191,41 @@
/*
 * FMA3 definitions of VFMADDPD_R / VFMADDPD_I, packed double precision
 * (presumably the real and imaginary accumulation steps, judging by the
 * _R / _I suffixes -- confirm against the kernel body).
 *
 * The variant macro defined at build time selects which instruction
 * each wrapper expands to:
 *
 *   NN NT TN TT : _R = vfmadd231pd    _I = vfmadd231pd
 *   RN RT CN CT : _R = vfnmadd231pd   _I = vfmadd231pd
 *   NR NC TR TC : _R = vfmadd231pd    _I = vfnmadd231pd
 *   otherwise   : _R = vfnmadd231pd   _I = vfnmadd231pd
 *
 * NOTE(review): every macro body below lists two instruction lines that
 * differ only in operand order. In AT&T order the last operand is the
 * destination, so the first form accumulates into y2 and the second
 * into y0. This looks like the before/after pair of the operand-order
 * correction rendered without diff markers; the second form is
 * presumably the intended one. Confirm which single line the real file
 * holds before assembling from this text.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* plain variants: both wrappers add the product */
.macro VFMADDPD_R y0,y1,y2
vfmadd231pd \y0,\y1,\y2
vfmadd231pd \y1,\y2,\y0
.endm
.macro VFMADDPD_I y0,y1,y2
vfmadd231pd \y0,\y1,\y2
vfmadd231pd \y1,\y2,\y0
.endm
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* _R uses the negated-product form, _I adds the product */
.macro VFMADDPD_R y0,y1,y2
vfnmadd231pd \y0,\y1,\y2
vfnmadd231pd \y1,\y2,\y0
.endm
.macro VFMADDPD_I y0,y1,y2
vfmadd231pd \y0,\y1,\y2
vfmadd231pd \y1,\y2,\y0
.endm
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* _R adds the product, _I uses the negated-product form */
.macro VFMADDPD_R y0,y1,y2
vfmadd231pd \y0,\y1,\y2
vfmadd231pd \y1,\y2,\y0
.endm
.macro VFMADDPD_I y0,y1,y2
vfnmadd231pd \y0,\y1,\y2
vfnmadd231pd \y1,\y2,\y0
.endm
#else
/* remaining variants: both wrappers use the negated-product form */
.macro VFMADDPD_R y0,y1,y2
vfnmadd231pd \y0,\y1,\y2
vfnmadd231pd \y1,\y2,\y0
.endm
.macro VFMADDPD_I y0,y1,y2
vfnmadd231pd \y0,\y1,\y2
vfnmadd231pd \y1,\y2,\y0
.endm
#endif