Corrected and tested FMA3 code
This commit is contained in:
parent
f51a849d91
commit
f6b50057e2
|
@ -37,6 +37,11 @@
|
|||
/*********************************************************************/
|
||||
|
||||
/*********************************************************************
|
||||
* 2013/10/19 Saar
|
||||
* BLASTEST :
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
* 2013/08/16 Saar
|
||||
* Parameter:
|
||||
* CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -139,7 +144,7 @@
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#if defined(BULLDOZER)
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
|
@ -188,41 +193,41 @@
|
|||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
.macro VFMADDPS_R y0,y1,y2
|
||||
vfmadd231ps \y0,\y1,\y2
|
||||
vfmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPS_I y0,y1,y2
|
||||
vfmadd231ps \y0,\y1,\y2
|
||||
vfmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
|
||||
.macro VFMADDPS_R y0,y1,y2
|
||||
vfnmadd231ps \y0,\y1,\y2
|
||||
vfnmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPS_I y0,y1,y2
|
||||
vfmadd231ps \y0,\y1,\y2
|
||||
vfmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
|
||||
.macro VFMADDPS_R y0,y1,y2
|
||||
vfmadd231ps \y0,\y1,\y2
|
||||
vfmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPS_I y0,y1,y2
|
||||
vfnmadd231ps \y0,\y1,\y2
|
||||
vfnmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro VFMADDPS_R y0,y1,y2
|
||||
vfnmadd231ps \y0,\y1,\y2
|
||||
vfnmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPS_I y0,y1,y2
|
||||
vfnmadd231ps \y0,\y1,\y2
|
||||
vfnmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
|
|
@ -36,6 +36,31 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/*********************************************************************
|
||||
* 2013/10/19 Saar
|
||||
* BLASTEST :
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
||||
*
|
||||
*
|
||||
* 2013/08/15 Saar
|
||||
* Parameter:
|
||||
* SGEMM_DEFAULT_UNROLL_N 2
|
||||
* SGEMM_DEFAULT_UNROLL_M 16
|
||||
* SGEMM_DEFAULT_P 384
|
||||
* SGEMM_DEFAULT_Q 168
|
||||
*
|
||||
* BLASTEST: OK
|
||||
*
|
||||
* Performance:
|
||||
* 1 thread: 2.31 times faster than sandybridge
|
||||
* 4 threads: 2.26 times faster than sandybridge
|
||||
*
|
||||
* Compile for FMA3: OK
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
@ -130,11 +155,11 @@
|
|||
#else
|
||||
|
||||
.macro VFMADD231PD_ y0,y1,y2
|
||||
vfmadd231pd \y0,\y1,\y2
|
||||
vfmadd231pd \y2,\y1,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADD231SD_ x0,x1,x2
|
||||
vfmadd231sd \x0,\x1,\x2
|
||||
vfmadd231sd \x2,\x1,\x0
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
|
|
@ -36,6 +36,28 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/*********************************************************************
|
||||
* 2013/10/19 Saar
|
||||
* BLASTEST :
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
* 2013/08/15 Saar
|
||||
* Parameter:
|
||||
* SGEMM_DEFAULT_UNROLL_N 4
|
||||
* SGEMM_DEFAULT_UNROLL_M 16
|
||||
* SGEMM_DEFAULT_P 768
|
||||
* SGEMM_DEFAULT_Q 168
|
||||
*
|
||||
* BLASTEST: OK
|
||||
*
|
||||
* Performance:
|
||||
* 1 thread: 2.22 times faster than sandybridge
|
||||
* 4 threads: 2.26 times faster than sandybridge
|
||||
*
|
||||
* Compile for FMA3: OK
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
@ -60,7 +82,6 @@
|
|||
#define SP %rbx
|
||||
|
||||
#define BO1 %rdi
|
||||
#define BO2 %r15
|
||||
#define CO2 %rdx
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
@ -131,11 +152,11 @@
|
|||
#else
|
||||
|
||||
.macro VFMADD231PS_ y0,y1,y2
|
||||
vfmadd231ps \y0,\y1,\y2
|
||||
vfmadd231ps \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADD231SS_ x0,x1,x2
|
||||
vfmadd231ss \x0,\x1,\x2
|
||||
vfmadd231ss \x1,\x2,\x0
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
@ -791,7 +812,7 @@
|
|||
movq OLD_C, C
|
||||
movq OLD_LDC, LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd OLD_OFFSET, %xmm12
|
||||
vmovsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
|
||||
|
@ -836,8 +857,8 @@
|
|||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
vmovss %xmm12, OFFSET
|
||||
vmovss %xmm12, KK
|
||||
vmovsd %xmm12, OFFSET
|
||||
vmovsd %xmm12, KK
|
||||
#ifndef LEFT
|
||||
negq KK
|
||||
#endif
|
||||
|
@ -1629,7 +1650,7 @@
|
|||
|
||||
.L4_60:
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
addq $2, KK
|
||||
addq $4, KK
|
||||
#endif
|
||||
|
||||
decq J // j --
|
||||
|
|
|
@ -37,6 +37,11 @@
|
|||
/*********************************************************************/
|
||||
|
||||
/*********************************************************************
|
||||
* 2013/10/19 Saar
|
||||
* BLASTEST :
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
* 2013/08/16 Saar
|
||||
* Parameter:
|
||||
* ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -44,7 +49,6 @@
|
|||
* ZGEMM_DEFAULT_P 112
|
||||
* ZGEMM_DEFAULT_Q 224
|
||||
*
|
||||
* BLASTEST: OK
|
||||
*
|
||||
* Performance:
|
||||
* 1 thread: 1.80 times faster than sandybridge
|
||||
|
@ -138,7 +142,7 @@
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#if defined(BULLDOZER)
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
|
@ -187,41 +191,41 @@
|
|||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
.macro VFMADDPD_R y0,y1,y2
|
||||
vfmadd231pd \y0,\y1,\y2
|
||||
vfmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPD_I y0,y1,y2
|
||||
vfmadd231pd \y0,\y1,\y2
|
||||
vfmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
|
||||
.macro VFMADDPD_R y0,y1,y2
|
||||
vfnmadd231pd \y0,\y1,\y2
|
||||
vfnmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPD_I y0,y1,y2
|
||||
vfmadd231pd \y0,\y1,\y2
|
||||
vfmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
|
||||
.macro VFMADDPD_R y0,y1,y2
|
||||
vfmadd231pd \y0,\y1,\y2
|
||||
vfmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPD_I y0,y1,y2
|
||||
vfnmadd231pd \y0,\y1,\y2
|
||||
vfnmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro VFMADDPD_R y0,y1,y2
|
||||
vfnmadd231pd \y0,\y1,\y2
|
||||
vfnmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
.macro VFMADDPD_I y0,y1,y2
|
||||
vfnmadd231pd \y0,\y1,\y2
|
||||
vfnmadd231pd \y1,\y2,\y0
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue