corrected and tested FMA3 Code
This commit is contained in:
parent
f51a849d91
commit
f6b50057e2
|
@ -37,6 +37,11 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
/*********************************************************************
|
/*********************************************************************
|
||||||
|
* 2013/10/19 Saar
|
||||||
|
* BLASTEST :
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
*
|
||||||
* 2013/08/16 Saar
|
* 2013/08/16 Saar
|
||||||
* Parameter:
|
* Parameter:
|
||||||
* CGEMM_DEFAULT_UNROLL_N 2
|
* CGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
@ -139,7 +144,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
#if defined(BULLDOZER)
|
||||||
|
|
||||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
|
||||||
|
@ -188,41 +193,41 @@
|
||||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
|
||||||
.macro VFMADDPS_R y0,y1,y2
|
.macro VFMADDPS_R y0,y1,y2
|
||||||
vfmadd231ps \y0,\y1,\y2
|
vfmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPS_I y0,y1,y2
|
.macro VFMADDPS_I y0,y1,y2
|
||||||
vfmadd231ps \y0,\y1,\y2
|
vfmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||||
|
|
||||||
.macro VFMADDPS_R y0,y1,y2
|
.macro VFMADDPS_R y0,y1,y2
|
||||||
vfnmadd231ps \y0,\y1,\y2
|
vfnmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPS_I y0,y1,y2
|
.macro VFMADDPS_I y0,y1,y2
|
||||||
vfmadd231ps \y0,\y1,\y2
|
vfmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||||
|
|
||||||
.macro VFMADDPS_R y0,y1,y2
|
.macro VFMADDPS_R y0,y1,y2
|
||||||
vfmadd231ps \y0,\y1,\y2
|
vfmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPS_I y0,y1,y2
|
.macro VFMADDPS_I y0,y1,y2
|
||||||
vfnmadd231ps \y0,\y1,\y2
|
vfnmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
.macro VFMADDPS_R y0,y1,y2
|
.macro VFMADDPS_R y0,y1,y2
|
||||||
vfnmadd231ps \y0,\y1,\y2
|
vfnmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPS_I y0,y1,y2
|
.macro VFMADDPS_I y0,y1,y2
|
||||||
vfnmadd231ps \y0,\y1,\y2
|
vfnmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -36,6 +36,31 @@
|
||||||
/* or implied, of The University of Texas at Austin. */
|
/* or implied, of The University of Texas at Austin. */
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/*********************************************************************
|
||||||
|
* 2013/10/19 Saar
|
||||||
|
* BLASTEST :
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* 2013/08/15 Saar
|
||||||
|
* Parameter:
|
||||||
|
* SGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
* SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
* SGEMM_DEFAULT_P 384
|
||||||
|
* SGEMM_DEFAULT_Q 168
|
||||||
|
*
|
||||||
|
* BLASTEST: OK
|
||||||
|
*
|
||||||
|
* Performance:
|
||||||
|
* 1 thread: 2.31 times faster than sandybridge
|
||||||
|
* 4 threads: 2.26 times faster than sandybridge
|
||||||
|
*
|
||||||
|
* Compile for FMA3: OK
|
||||||
|
*
|
||||||
|
*********************************************************************/
|
||||||
|
|
||||||
|
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
@ -130,11 +155,11 @@
|
||||||
#else
|
#else
|
||||||
|
|
||||||
.macro VFMADD231PD_ y0,y1,y2
|
.macro VFMADD231PD_ y0,y1,y2
|
||||||
vfmadd231pd \y0,\y1,\y2
|
vfmadd231pd \y2,\y1,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADD231SD_ x0,x1,x2
|
.macro VFMADD231SD_ x0,x1,x2
|
||||||
vfmadd231sd \x0,\x1,\x2
|
vfmadd231sd \x2,\x1,\x0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -36,6 +36,28 @@
|
||||||
/* or implied, of The University of Texas at Austin. */
|
/* or implied, of The University of Texas at Austin. */
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/*********************************************************************
|
||||||
|
* 2013/10/19 Saar
|
||||||
|
* BLASTEST :
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
*
|
||||||
|
* 2013/08/15 Saar
|
||||||
|
* Parameter:
|
||||||
|
* SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
* SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
* SGEMM_DEFAULT_P 768
|
||||||
|
* SGEMM_DEFAULT_Q 168
|
||||||
|
*
|
||||||
|
* BLASTEST: OK
|
||||||
|
*
|
||||||
|
* Performance:
|
||||||
|
* 1 thread: 2.22 times faster than sandybridge
|
||||||
|
* 4 threads: 2.26 times faster than sandybridge
|
||||||
|
*
|
||||||
|
* Compile for FMA3: OK
|
||||||
|
*
|
||||||
|
*********************************************************************/
|
||||||
|
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
@ -60,7 +82,6 @@
|
||||||
#define SP %rbx
|
#define SP %rbx
|
||||||
|
|
||||||
#define BO1 %rdi
|
#define BO1 %rdi
|
||||||
#define BO2 %r15
|
|
||||||
#define CO2 %rdx
|
#define CO2 %rdx
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
@ -131,11 +152,11 @@
|
||||||
#else
|
#else
|
||||||
|
|
||||||
.macro VFMADD231PS_ y0,y1,y2
|
.macro VFMADD231PS_ y0,y1,y2
|
||||||
vfmadd231ps \y0,\y1,\y2
|
vfmadd231ps \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADD231SS_ x0,x1,x2
|
.macro VFMADD231SS_ x0,x1,x2
|
||||||
vfmadd231ss \x0,\x1,\x2
|
vfmadd231ss \x1,\x2,\x0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -791,7 +812,7 @@
|
||||||
movq OLD_C, C
|
movq OLD_C, C
|
||||||
movq OLD_LDC, LDC
|
movq OLD_LDC, LDC
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
movsd OLD_OFFSET, %xmm12
|
vmovsd OLD_OFFSET, %xmm12
|
||||||
#endif
|
#endif
|
||||||
vmovaps %xmm3, %xmm0
|
vmovaps %xmm3, %xmm0
|
||||||
|
|
||||||
|
@ -836,8 +857,8 @@
|
||||||
|
|
||||||
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
vmovss %xmm12, OFFSET
|
vmovsd %xmm12, OFFSET
|
||||||
vmovss %xmm12, KK
|
vmovsd %xmm12, KK
|
||||||
#ifndef LEFT
|
#ifndef LEFT
|
||||||
negq KK
|
negq KK
|
||||||
#endif
|
#endif
|
||||||
|
@ -1629,7 +1650,7 @@
|
||||||
|
|
||||||
.L4_60:
|
.L4_60:
|
||||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
addq $2, KK
|
addq $4, KK
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
decq J // j --
|
decq J // j --
|
||||||
|
|
|
@ -37,6 +37,11 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
/*********************************************************************
|
/*********************************************************************
|
||||||
|
* 2013/10/19 Saar
|
||||||
|
* BLASTEST :
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
*
|
||||||
* 2013/08/16 Saar
|
* 2013/08/16 Saar
|
||||||
* Parameter:
|
* Parameter:
|
||||||
* ZGEMM_DEFAULT_UNROLL_N 2
|
* ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
@ -44,7 +49,6 @@
|
||||||
* ZGEMM_DEFAULT_P 112
|
* ZGEMM_DEFAULT_P 112
|
||||||
* ZGEMM_DEFAULT_Q 224
|
* ZGEMM_DEFAULT_Q 224
|
||||||
*
|
*
|
||||||
* BLASTEST: OK
|
|
||||||
*
|
*
|
||||||
* Performance:
|
* Performance:
|
||||||
* 1 thread: 1.80 times faster than sandybridge
|
* 1 thread: 1.80 times faster than sandybridge
|
||||||
|
@ -138,7 +142,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
#if defined(BULLDOZER)
|
||||||
|
|
||||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
|
||||||
|
@ -187,41 +191,41 @@
|
||||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
|
||||||
.macro VFMADDPD_R y0,y1,y2
|
.macro VFMADDPD_R y0,y1,y2
|
||||||
vfmadd231pd \y0,\y1,\y2
|
vfmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPD_I y0,y1,y2
|
.macro VFMADDPD_I y0,y1,y2
|
||||||
vfmadd231pd \y0,\y1,\y2
|
vfmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||||
|
|
||||||
.macro VFMADDPD_R y0,y1,y2
|
.macro VFMADDPD_R y0,y1,y2
|
||||||
vfnmadd231pd \y0,\y1,\y2
|
vfnmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPD_I y0,y1,y2
|
.macro VFMADDPD_I y0,y1,y2
|
||||||
vfmadd231pd \y0,\y1,\y2
|
vfmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||||
|
|
||||||
.macro VFMADDPD_R y0,y1,y2
|
.macro VFMADDPD_R y0,y1,y2
|
||||||
vfmadd231pd \y0,\y1,\y2
|
vfmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPD_I y0,y1,y2
|
.macro VFMADDPD_I y0,y1,y2
|
||||||
vfnmadd231pd \y0,\y1,\y2
|
vfnmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
.macro VFMADDPD_R y0,y1,y2
|
.macro VFMADDPD_R y0,y1,y2
|
||||||
vfnmadd231pd \y0,\y1,\y2
|
vfnmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro VFMADDPD_I y0,y1,y2
|
.macro VFMADDPD_I y0,y1,y2
|
||||||
vfnmadd231pd \y0,\y1,\y2
|
vfnmadd231pd \y1,\y2,\y0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue