Changes for compatibility with the PathScale compiler

This commit is contained in:
wernsaar 2013-11-13 17:59:11 +01:00
parent 6216ab8a7e
commit f1db386211
5 changed files with 178 additions and 238 deletions
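The PathScale assembler apparently does not accept GNU-as .macro/.endm definitions that take arguments, so the GEMM kernels below replace them with C-preprocessor #define macros; these .S files pass through cpp before assembly, so the assembler only ever sees the fully expanded instruction. Independently, the EPILOGUE macro in the two common headers now writes the .note.GNU-stack section type as @progbits instead of %progbits when C_PATHSCALE or OS_DARWIN is defined. A minimal before/after sketch of the macro conversion, using one of the macros converted below:

/* Before: GNU-as macro with arguments (problematic for the PathScale assembler) */
.macro VFMADDPS_R y0,y1,y2
        vfmaddps \y0,\y1,\y2,\y0
.endm
        VFMADDPS_R %ymm8,%ymm4,%ymm0

/* After: cpp macro, expanded before the assembler runs */
#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
        VFMADDPS_R( %ymm8,%ymm4,%ymm0 )

/* Either way the assembler receives the same instruction */
        vfmaddps %ymm8,%ymm4,%ymm0,%ymm8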

View File

@@ -171,11 +171,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #define MMXSTORE movd
 #endif

-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif

 #if defined(PILEDRIVER) || defined(BULLDOZER)
 //Enable some optimazation for barcelona.
 #define BARCELONA_OPTIMIZATION
@@ -306,12 +301,25 @@ REALNAME:
 #define PROFCODE
 #endif

+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
 #define EPILOGUE \
 .size REALNAME, .-REALNAME; \
+.section .note.GNU-stack,"",@progbits
+#else
+#define EPILOGUE \
+.size REALNAME, .-REALNAME; \
 .section .note.GNU-stack,"",%progbits
 #endif
+#endif

 #ifdef XDOUBLE
 #define FLD fldt
 #define FST fstpt
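Both spellings name the same section type — .note.GNU-stack is SHT_PROGBITS, the conventional marker for a non-executable stack — but while GNU as on x86 accepts @progbits as well as %progbits, the PathScale and Darwin toolchains apparently only take the @ form. What a kernel sees after this change, assuming the usual PROLOGUE counterpart defined earlier in this header:

        PROLOGUE        /* .text/.globl/.type boilerplate ending at the REALNAME label */
        ...             /* kernel body */
        ret
        EPILOGUE        /* .size REALNAME, .-REALNAME; then the .note.GNU-stack line,
                           @progbits under C_PATHSCALE/OS_DARWIN, %progbits otherwise */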

View File

@@ -218,12 +218,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #ifdef ASSEMBLER

-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif

 #if defined(PILEDRIVER) || defined(BULLDOZER)
 //Enable some optimazation for barcelona.
 #define BARCELONA_OPTIMIZATION
@@ -378,10 +372,20 @@ REALNAME:
 #define PROFCODE
 #endif

+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+#define EPILOGUE \
+.size REALNAME, .-REALNAME; \
+.section .note.GNU-stack,"",@progbits
+#else
 #define EPILOGUE \
 .size REALNAME, .-REALNAME; \
 .section .note.GNU-stack,"",%progbits
+#endif
 #endif

View File

@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /*********************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -138,43 +138,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-.macro VFMADDPS_R y0,y1,y2
-vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)

-.macro VFMADDPS_R y0,y1,y2
-vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)

-.macro VFMADDPS_R y0,y1,y2
-vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0

 #else

-.macro VFMADDPS_R y0,y1,y2
-vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0

 #endif
@@ -182,43 +166,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-.macro VFMADDPS_R y0,y1,y2
-vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0

 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)

-.macro VFMADDPS_R y0,y1,y2
-vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0

 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)

-.macro VFMADDPS_R y0,y1,y2
-vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0

 #else

-.macro VFMADDPS_R y0,y1,y2
-vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0

-.macro VFMADDPS_I y0,y1,y2
-vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0

 #endif
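Operand order above is AT&T, and both instruction-set paths compute the same thing. For the NN case, one call from the kernel macros below expands as follows (a sketch, not part of the commit):

/* kernel source line */
        VFMADDPS_R( %ymm8,%ymm4,%ymm0 )

/* FMA4 path (BULLDOZER/PILEDRIVER): ymm8 = ymm0*ymm4 + ymm8 */
        vfmaddps %ymm8,%ymm4,%ymm0,%ymm8

/* FMA3 path: ymm8 += ymm0*ymm4, accumulating in place */
        vfmadd231ps %ymm4,%ymm0,%ymm8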
@@ -234,18 +202,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
-VFMADDPS_R %ymm8,%ymm4,%ymm0
+VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
-VFMADDPS_R %ymm12,%ymm4,%ymm1
+VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
 vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
-VFMADDPS_I %ymm9,%ymm5,%ymm0
-VFMADDPS_I %ymm13,%ymm5,%ymm1
+VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
+VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
 vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
-VFMADDPS_R %ymm10,%ymm6,%ymm0
-VFMADDPS_R %ymm14,%ymm6,%ymm1
+VFMADDPS_R( %ymm10,%ymm6,%ymm0 )
+VFMADDPS_R( %ymm14,%ymm6,%ymm1 )
 vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
-VFMADDPS_I %ymm11,%ymm7,%ymm0
-VFMADDPS_I %ymm15,%ymm7,%ymm1
+VFMADDPS_I( %ymm11,%ymm7,%ymm0 )
+VFMADDPS_I( %ymm15,%ymm7,%ymm1 )
 addq $4 , BI
 addq $16, %rax
 .endm
@@ -338,18 +306,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x2_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
-VFMADDPS_R %xmm12,%xmm4,%xmm1
+VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
-VFMADDPS_I %xmm13,%xmm5,%xmm1
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
+VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-VFMADDPS_R %xmm10,%xmm6,%xmm0
-VFMADDPS_R %xmm14,%xmm6,%xmm1
+VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
+VFMADDPS_R( %xmm14,%xmm6,%xmm1 )
 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-VFMADDPS_I %xmm11,%xmm7,%xmm0
-VFMADDPS_I %xmm15,%xmm7,%xmm1
+VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
+VFMADDPS_I( %xmm15,%xmm7,%xmm1 )
 addq $4, BI
 addq $8, %rax
 .endm
@@ -437,13 +405,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x2_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-VFMADDPS_R %xmm10,%xmm6,%xmm0
+VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-VFMADDPS_I %xmm11,%xmm7,%xmm0
+VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
 addq $4, BI
 addq $4, %rax
 .endm
@@ -509,13 +477,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x2_SUB
 vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-VFMADDPS_R %xmm10,%xmm6,%xmm0
+VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-VFMADDPS_I %xmm11,%xmm7,%xmm0
+VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
 addq $4, BI
 addq $2, %rax
 .endm
@@ -583,11 +551,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
-VFMADDPS_R %ymm8,%ymm4,%ymm0
-VFMADDPS_R %ymm12,%ymm4,%ymm1
+VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
+VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
-VFMADDPS_I %ymm9,%ymm5,%ymm0
-VFMADDPS_I %ymm13,%ymm5,%ymm1
+VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
+VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
 addq $2 , BI
 addq $16, %rax
 .endm
@@ -654,12 +622,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
-VFMADDPS_R %xmm12,%xmm4,%xmm1
+VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
-VFMADDPS_I %xmm13,%xmm5,%xmm1
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
+VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
 addq $2, BI
 addq $8, %rax
 .endm
@@ -723,9 +691,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
 addq $2, BI
 addq $4, %rax
 .endm
@@ -778,9 +746,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
 vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPS_R %xmm8,%xmm4,%xmm0
+VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPS_I %xmm9,%xmm5,%xmm0
+VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
 addq $2, BI
 addq $2, %rax
 .endm

View File

@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /*********************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -131,23 +131,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(BULLDOZER)

-.macro VFMADD231PS_ y0,y1,y2
-vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

-.macro VFMADD231SS_ x0,x1,x2
-vfmaddss \x0,\x1,\x2,\x0
-.endm
+#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0

 #else

-.macro VFMADD231PS_ y0,y1,y2
-vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0

-.macro VFMADD231SS_ x0,x1,x2
-vfmadd231ss \x1,\x2,\x0
-.endm
+#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0

 #endif
@@ -164,16 +156,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
-VFMADD231PS_ %ymm5,%ymm2,%ymm1
-VFMADD231PS_ %ymm6,%ymm3,%ymm0
-VFMADD231PS_ %ymm7,%ymm3,%ymm1
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
+VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
 vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm8,%ymm2,%ymm0
-VFMADD231PS_ %ymm9,%ymm2,%ymm1
-VFMADD231PS_ %ymm10,%ymm3,%ymm0
-VFMADD231PS_ %ymm11,%ymm3,%ymm1
+VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm9,%ymm2,%ymm1 )
+VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
+VFMADD231PS_( %ymm11,%ymm3,%ymm1 )
 addq $4 , BI
 addq $16, %rax
 .endm
@@ -235,12 +227,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
-VFMADD231PS_ %ymm6,%ymm3,%ymm0
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
 vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm8,%ymm2,%ymm0
-VFMADD231PS_ %ymm10,%ymm3,%ymm0
+VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
 addq $4 , BI
 addq $8 , %rax
 .endm
@@ -279,12 +271,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231PS_ %xmm4,%xmm2,%xmm0
-VFMADD231PS_ %xmm6,%xmm3,%xmm0
+VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
 vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
 vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231PS_ %xmm8,%xmm2,%xmm0
-VFMADD231PS_ %xmm10,%xmm3,%xmm0
+VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
+VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
 addq $4 , BI
 addq $4 , %rax
 .endm
@@ -323,16 +315,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
-VFMADD231SS_ %xmm5,%xmm2,%xmm1
-VFMADD231SS_ %xmm6,%xmm3,%xmm0
-VFMADD231SS_ %xmm7,%xmm3,%xmm1
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
 vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm8,%xmm2,%xmm0
-VFMADD231SS_ %xmm9,%xmm2,%xmm1
-VFMADD231SS_ %xmm10,%xmm3,%xmm0
-VFMADD231SS_ %xmm11,%xmm3,%xmm1
+VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
+VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
 addq $4 , BI
 addq $2, %rax
 .endm
@@ -388,12 +380,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
-VFMADD231SS_ %xmm6,%xmm3,%xmm0
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
 vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm8,%xmm2,%xmm0
-VFMADD231SS_ %xmm10,%xmm3,%xmm0
+VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
 addq $4 , BI
 addq $1, %rax
 .endm
@@ -436,10 +428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
-VFMADD231PS_ %ymm5,%ymm2,%ymm1
-VFMADD231PS_ %ymm6,%ymm3,%ymm0
-VFMADD231PS_ %ymm7,%ymm3,%ymm1
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
+VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
 addq $2 , BI
 addq $16, %rax
 .endm
@@ -480,8 +472,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
-VFMADD231PS_ %ymm6,%ymm3,%ymm0
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
 addq $2 , BI
 addq $8 , %rax
 .endm
@@ -513,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231PS_ %xmm4,%xmm2,%xmm0
-VFMADD231PS_ %xmm6,%xmm3,%xmm0
+VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
 addq $2 , BI
 addq $4 , %rax
 .endm
@@ -546,10 +538,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
-VFMADD231SS_ %xmm5,%xmm2,%xmm1
-VFMADD231SS_ %xmm6,%xmm3,%xmm0
-VFMADD231SS_ %xmm7,%xmm3,%xmm1
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
 addq $2 , BI
 addq $2, %rax
 .endm
@@ -589,8 +581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
-VFMADD231SS_ %xmm6,%xmm3,%xmm0
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
 addq $2 , BI
 addq $1, %rax
 .endm
@@ -625,8 +617,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
-VFMADD231PS_ %ymm5,%ymm2,%ymm1
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
 addq $1 , BI
 addq $16, %rax
 .endm
@@ -656,7 +648,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL8x1_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
-VFMADD231PS_ %ymm4,%ymm2,%ymm0
+VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
 addq $1 , BI
 addq $8 , %rax
 .endm
@@ -684,7 +676,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_SUB
 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
-VFMADD231PS_ %xmm4,%xmm2,%xmm0
+VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
 addq $1 , BI
 addq $4 , %rax
 .endm
@@ -712,8 +704,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
-VFMADD231SS_ %xmm5,%xmm2,%xmm1
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
 addq $1 , BI
 addq $2, %rax
 .endm
@@ -743,7 +735,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
 vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
-VFMADD231SS_ %xmm4,%xmm2,%xmm0
+VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
 addq $1 , BI
 addq $1, %rax
 .endm

View File

@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /********************************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -137,43 +137,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-.macro VFMADDPD_R y0,y1,y2
-vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0

 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)

-.macro VFMADDPD_R y0,y1,y2
-vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0

 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)

-.macro VFMADDPD_R y0,y1,y2
-vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0

 #else

-.macro VFMADDPD_R y0,y1,y2
-vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0

 #endif
@@ -181,43 +165,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-.macro VFMADDPD_R y0,y1,y2
-vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0

 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)

-.macro VFMADDPD_R y0,y1,y2
-vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0

 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)

-.macro VFMADDPD_R y0,y1,y2
-vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0

 #else

-.macro VFMADDPD_R y0,y1,y2
-vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0

-.macro VFMADDPD_I y0,y1,y2
-vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0

 #endif
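The four branches encode the sign handling for complex multiply-accumulate: with a = ar + i*ai and b = br + i*bi, a*b = (ar*br - ai*bi) + i*(ar*bi + ai*br), and conjugating A or B flips the sign of the matching terms. Reading the macro arguments as (accumulator, broadcast element of B, packed column of A) — the vfnmadd forms subtract the product — the defines above amount to (a summary, not text from the commit):

/* variant                        VFMADDPD_R          VFMADDPD_I      */
/* no conjugation (NN,NT,TN,TT)   acc += b_r * a      acc += b_i * a  */
/* A conjugated  (RN,RT,CN,CT)    acc -= b_r * a      acc += b_i * a  */
/* B conjugated  (NR,NC,TR,TC)    acc += b_r * a      acc -= b_i * a  */
/* both conjugated (remaining)    acc -= b_r * a      acc -= b_i * a  */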
@@ -233,16 +201,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4
 vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5
-VFMADDPD_R %ymm8 ,%ymm4,%ymm0
-VFMADDPD_R %ymm12,%ymm4,%ymm1
+VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
+VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6
-VFMADDPD_I %ymm9 ,%ymm5,%ymm0
-VFMADDPD_I %ymm13,%ymm5,%ymm1
+VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
+VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7
-VFMADDPD_R %ymm10,%ymm6,%ymm0
-VFMADDPD_R %ymm14,%ymm6,%ymm1
-VFMADDPD_I %ymm11,%ymm7,%ymm0
-VFMADDPD_I %ymm15,%ymm7,%ymm1
+VFMADDPD_R( %ymm10,%ymm6,%ymm0 )
+VFMADDPD_R( %ymm14,%ymm6,%ymm1 )
+VFMADDPD_I( %ymm11,%ymm7,%ymm0 )
+VFMADDPD_I( %ymm15,%ymm7,%ymm1 )
 addq $4, BI
 addq $8, %rax
@@ -337,17 +305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
 vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
-VFMADDPD_R %xmm8,%xmm4,%xmm0
-VFMADDPD_R %xmm12,%xmm4,%xmm1
+VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
+VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
 vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPD_I %xmm9,%xmm5,%xmm0
-VFMADDPD_I %xmm13,%xmm5,%xmm1
+VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
+VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
 vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
-VFMADDPD_R %xmm10,%xmm6,%xmm0
-VFMADDPD_R %xmm14,%xmm6,%xmm1
+VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
+VFMADDPD_R( %xmm14,%xmm6,%xmm1 )
 vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
-VFMADDPD_I %xmm11,%xmm7,%xmm0
-VFMADDPD_I %xmm15,%xmm7,%xmm1
+VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
+VFMADDPD_I( %xmm15,%xmm7,%xmm1 )
 addq $4, BI
 addq $4, %rax
 .endm
@@ -441,12 +409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
 vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPD_R %xmm8,%xmm4,%xmm0
-VFMADDPD_I %xmm9,%xmm5,%xmm0
+VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
+VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
 vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
 vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
-VFMADDPD_R %xmm10,%xmm6,%xmm0
-VFMADDPD_I %xmm11,%xmm7,%xmm0
+VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
+VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
 addq $4, BI
 addq $2, %rax
 .endm
@@ -513,10 +481,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
 vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4
 vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5
-VFMADDPD_R %ymm8 ,%ymm4,%ymm0
-VFMADDPD_R %ymm12,%ymm4,%ymm1
-VFMADDPD_I %ymm9 ,%ymm5,%ymm0
-VFMADDPD_I %ymm13,%ymm5,%ymm1
+VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
+VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
+VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
+VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
 addq $2, BI
 addq $8, %rax
@@ -585,12 +553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_SUB
 vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPD_R %xmm8,%xmm4,%xmm0
+VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
 vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
-VFMADDPD_R %xmm12,%xmm4,%xmm1
+VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
 vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPD_I %xmm9,%xmm5,%xmm0
-VFMADDPD_I %xmm13,%xmm5,%xmm1
+VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
+VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
 addq $2, BI
 addq $4, %rax
 .endm
@@ -655,9 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
 vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
-VFMADDPD_R %xmm8,%xmm4,%xmm0
+VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
 vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
-VFMADDPD_I %xmm9,%xmm5,%xmm0
+VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
 addq $2, BI
 addq $2, %rax
 .endm