Merge pull request #47 from xianyi/develop

rebase
commit 1c1ca2bc0a
Martin Kroeker, 2020-04-18 21:07:14 +02:00 (committed by GitHub)
16 changed files with 147 additions and 101 deletions

View File

@@ -8,7 +8,7 @@ platform:
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
@@ -32,7 +32,7 @@ platform:
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
@@ -152,7 +152,31 @@ platform:
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
+  environment:
+    CC: gcc
+    COMMON_FLAGS: 'USE_OPENMP=1'
+  commands:
+  - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+  - apt-get update -y
+  - apt-get install -y make $CC gfortran perl python g++
+  - $CC --version
+  - make QUIET_MAKE=1 $COMMON_FLAGS
+  - make -C test $COMMON_FLAGS
+  - make -C ctest $COMMON_FLAGS
+  - make -C utest $COMMON_FLAGS
+  - make -C cpp_thread_test dgemm_tester
+
+---
+kind: pipeline
+name: epyc_native_test
+
+platform:
+  os: linux
+  arch: amd64
+
+steps:
+- name: Build and Test
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'USE_OPENMP=1'

View File

@@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
+Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 
 ## Introduction
 
 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
@@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
 - **TSV110**: Optimized some Level-3 helper functions
+- **EMAG 8180**: preliminary support based on A57
 
 #### PPC/PPC64
@@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
 
 For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
 
 DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
 
 On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
 
 For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
 
 The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
 
 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
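As a side note to the README excerpt above, a hedged example of how these options combine on the gmake command line; the chosen target names are illustrative assumptions drawn from the lists quoted there, not part of this commit:

	make DYNAMIC_ARCH=1 NUM_THREADS=32        # runtime CPU detection, default target list
	make DYNAMIC_ARCH=1 DYNAMIC_OLDER=1       # also include the older x86_64 targets
	make DYNAMIC_ARCH=1 TARGET=NEHALEM        # assume Nehalem as baseline for the common code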

View File

@@ -193,14 +193,14 @@ int main(int argc, char *argv[]){
 	    a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
 	    for(i = j + 1; i < m; i++) {
-	      a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+	      a[((long)i + (long)j * (long)m) * 2 + 0] = 0;
 	      a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
 	    }
 	  }
 	} else {
 	  for (j = 0; j < m; j++) {
 	    for(i = 0; i < j; i++) {
-	      a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+	      a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
 	      a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
 	    }

View File

@@ -43,6 +43,7 @@
 #define MB  asm("mb")
 #define WMB asm("wmb")
+#define RMB asm("rmb")
 
 static void __inline blas_lock(unsigned long *address){
 #ifndef __DECC

View File

@@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define MB
 #define WMB
+#define RMB
 
 #else
 
 #define MB  __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory")
 
 #endif

View File

@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define MB  __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")
 
 #define INLINE inline
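The three ARMv8 barrier flavors used above differ only in scope: `dmb ish` orders all memory accesses, `dmb ishst` only stores (hence WMB), and `dmb ishld` only loads (hence the new RMB), each within the inner-shareable domain. A minimal publish/consume sketch, assuming GCC on ARMv8 and the macro definitions directly above; the flag/payload variables are illustrative, not OpenBLAS code:

	/* Illustrative only: producer publishes a payload, consumer polls a flag. */
	volatile int flag = 0;
	long payload = 0;

	void producer(void) {
	    payload = 42;   /* write the data first                      */
	    WMB;            /* "dmb ishst": payload visible before flag  */
	    flag = 1;       /* then publish                              */
	}

	void consumer(void) {
	    while (!flag);  /* wait for publication                      */
	    RMB;            /* "dmb ishld": flag read before payload     */
	    /* payload is now guaranteed to read as 42 */
	}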

View File

@@ -47,6 +47,7 @@
 #define MB
 #define WMB
+#define RMB
 
 #ifdef __ECC
 #include <ia64intrin.h>

View File

@@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define MB  __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
 
 #define INLINE inline

View File

@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define MB  __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
 
 #define INLINE inline

View File

@@ -71,9 +71,11 @@
 #if defined(POWER8) || defined(POWER9)
 #define MB  __asm__ __volatile__ ("eieio":::"memory")
 #define WMB __asm__ __volatile__ ("eieio":::"memory")
+#define RMB __asm__ __volatile__ ("eieio":::"memory")
 #else
 #define MB  __asm__ __volatile__ ("sync")
 #define WMB __asm__ __volatile__ ("sync")
+#define RMB __asm__ __volatile__ ("sync")
 #endif
 
 #define INLINE inline

View File

@@ -41,6 +41,7 @@
 #define MB  __asm__ __volatile__ ("nop")
 #define WMB __asm__ __volatile__ ("nop")
+#define RMB __asm__ __volatile__ ("nop")
 
 #ifndef ASSEMBLER

View File

@@ -47,6 +47,7 @@
 #define MB
 #define WMB
+#define RMB
 
 #ifdef C_SUN
 #define __asm__ __asm

View File

@@ -63,13 +63,16 @@
 #ifdef __GNUC__
 #define MB  do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define RMB
 #else
 #define MB  do {} while (0)
 #define WMB do {} while (0)
+#define RMB
 #endif
 
 static void __inline blas_lock(volatile BLASULONG *address){
 #ifndef C_MSVC
   int ret;
 #else

View File

@@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define COMMON_ZARCH
 
 #define MB
-//__asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB
-//__asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB
 
 #define INLINE inline

View File

@@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){
   LOCK_COMMAND(&alloc_lock);
 #endif
   do {
+    RMB;
 #if defined(USE_OPENMP)
     if (!memory[position].used) {
       blas_lock(&memory[position].lock);
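Why the new RMB matters here: the do-loop polls memory[position].used, a flag that other threads set and clear, and without a read barrier the load can be hoisted out of the loop or satisfied from a stale value, so a freed buffer is never noticed. A simplified sketch of the pattern, with hypothetical names standing in for the real structures (RMB and blas_lock() are the ones defined in the common_*.h headers above):

	/* 'used' stands in for memory[position].used,
	   'lock' for memory[position].lock.            */
	extern volatile unsigned long used;
	extern unsigned long lock;

	void wait_for_free_buffer(void) {
	    do {
	        RMB;                  /* ordered, fresh load of 'used'      */
	        if (!used) {          /* buffer appears free                */
	            blas_lock(&lock); /* serialize with competing threads   */
	            break;            /* caller re-checks 'used' under lock */
	        }
	    } while (1);
	}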

View File

@@ -54,7 +54,7 @@
 #ifdef OPTERON
 #define LOAD(OFFSET, ADDR, REG)	xorps REG, REG; addpd OFFSET(ADDR), REG
 #else
-#define LOAD(OFFSET, ADDR, REG)	movaps OFFSET(ADDR), REG
+#define LOAD(OFFSET, ADDR, REG)	movups OFFSET(ADDR), REG
 #endif
 
 	PROLOGUE
@@ -104,14 +104,14 @@
 	sarq $4, %rax
 	jle .L13
 
-	movaps -16 * SIZE(X), %xmm0
-	movaps -14 * SIZE(X), %xmm1
-	movaps -12 * SIZE(X), %xmm2
-	movaps -10 * SIZE(X), %xmm3
-	movaps -8 * SIZE(X), %xmm4
-	movaps -6 * SIZE(X), %xmm5
-	movaps -4 * SIZE(X), %xmm6
-	movaps -2 * SIZE(X), %xmm7
+	movups -16 * SIZE(X), %xmm0
+	movups -14 * SIZE(X), %xmm1
+	movups -12 * SIZE(X), %xmm2
+	movups -10 * SIZE(X), %xmm3
+	movups -8 * SIZE(X), %xmm4
+	movups -6 * SIZE(X), %xmm5
+	movups -4 * SIZE(X), %xmm6
+	movups -2 * SIZE(X), %xmm7
 
 	decq %rax
 	jle .L12
@@ -122,36 +122,36 @@
 	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
 #endif
 
-	movaps %xmm0, -16 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
 	LOAD( 0 * SIZE, X, %xmm0)
-	movaps %xmm1, -14 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
 	LOAD( 2 * SIZE, X, %xmm1)
 
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
 #endif
 
-	movaps %xmm2, -12 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
 	LOAD( 4 * SIZE, X, %xmm2)
-	movaps %xmm3, -10 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
 	LOAD( 6 * SIZE, X, %xmm3)
 
 #if defined(PREFETCHW) && !defined(FETCH128)
 	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
 #endif
 
-	movaps %xmm4, -8 * SIZE(Y)
+	movups %xmm4, -8 * SIZE(Y)
 	LOAD( 8 * SIZE, X, %xmm4)
-	movaps %xmm5, -6 * SIZE(Y)
+	movups %xmm5, -6 * SIZE(Y)
 	LOAD(10 * SIZE, X, %xmm5)
 
 #if defined(PREFETCH) && !defined(FETCH128)
 	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
 #endif
 
-	movaps %xmm6, -4 * SIZE(Y)
+	movups %xmm6, -4 * SIZE(Y)
 	LOAD(12 * SIZE, X, %xmm6)
-	movaps %xmm7, -2 * SIZE(Y)
+	movups %xmm7, -2 * SIZE(Y)
 	LOAD(14 * SIZE, X, %xmm7)
 
 	subq $-16 * SIZE, Y
@@ -161,14 +161,14 @@
 	ALIGN_3
 
 .L12:
-	movaps %xmm0, -16 * SIZE(Y)
-	movaps %xmm1, -14 * SIZE(Y)
-	movaps %xmm2, -12 * SIZE(Y)
-	movaps %xmm3, -10 * SIZE(Y)
-	movaps %xmm4, -8 * SIZE(Y)
-	movaps %xmm5, -6 * SIZE(Y)
-	movaps %xmm6, -4 * SIZE(Y)
-	movaps %xmm7, -2 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
+	movups %xmm4, -8 * SIZE(Y)
+	movups %xmm5, -6 * SIZE(Y)
+	movups %xmm6, -4 * SIZE(Y)
+	movups %xmm7, -2 * SIZE(Y)
 
 	subq $-16 * SIZE, Y
 	subq $-16 * SIZE, X
@@ -179,15 +179,15 @@
 	jle .L14
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
-	movaps -14 * SIZE(X), %xmm1
-	movaps -12 * SIZE(X), %xmm2
-	movaps -10 * SIZE(X), %xmm3
+	movups -16 * SIZE(X), %xmm0
+	movups -14 * SIZE(X), %xmm1
+	movups -12 * SIZE(X), %xmm2
+	movups -10 * SIZE(X), %xmm3
 
-	movaps %xmm0, -16 * SIZE(Y)
-	movaps %xmm1, -14 * SIZE(Y)
-	movaps %xmm2, -12 * SIZE(Y)
-	movaps %xmm3, -10 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
 
 	addq $8 * SIZE, X
 	addq $8 * SIZE, Y
@@ -198,11 +198,11 @@
 	jle .L15
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
-	movaps -14 * SIZE(X), %xmm1
+	movups -16 * SIZE(X), %xmm0
+	movups -14 * SIZE(X), %xmm1
 
-	movaps %xmm0, -16 * SIZE(Y)
-	movaps %xmm1, -14 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
 
 	addq $4 * SIZE, X
 	addq $4 * SIZE, Y
@@ -213,8 +213,8 @@
 	jle .L16
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
-	movaps %xmm0, -16 * SIZE(Y)
+	movups -16 * SIZE(X), %xmm0
+	movups %xmm0, -16 * SIZE(Y)
 
 	addq $2 * SIZE, X
 	addq $2 * SIZE, Y
@@ -246,13 +246,13 @@
 	sarq $4, %rax
 	jle .L23
 
-	movaps -15 * SIZE(X), %xmm1
-	movaps -13 * SIZE(X), %xmm2
-	movaps -11 * SIZE(X), %xmm3
-	movaps -9 * SIZE(X), %xmm4
-	movaps -7 * SIZE(X), %xmm5
-	movaps -5 * SIZE(X), %xmm6
-	movaps -3 * SIZE(X), %xmm7
+	movups -15 * SIZE(X), %xmm1
+	movups -13 * SIZE(X), %xmm2
+	movups -11 * SIZE(X), %xmm3
+	movups -9 * SIZE(X), %xmm4
+	movups -7 * SIZE(X), %xmm5
+	movups -5 * SIZE(X), %xmm6
+	movups -3 * SIZE(X), %xmm7
 
 	decq %rax
 	jle .L22
@@ -264,11 +264,11 @@
 #endif
 
 	SHUFPD_1 %xmm1, %xmm0
-	movaps %xmm0, -16 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
 	LOAD(-1 * SIZE, X, %xmm0)
 
 	SHUFPD_1 %xmm2, %xmm1
-	movaps %xmm1, -14 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
 	LOAD( 1 * SIZE, X, %xmm1)
 
 #ifdef PREFETCH
@@ -276,11 +276,11 @@
 #endif
 
 	SHUFPD_1 %xmm3, %xmm2
-	movaps %xmm2, -12 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
 	LOAD( 3 * SIZE, X, %xmm2)
 
 	SHUFPD_1 %xmm4, %xmm3
-	movaps %xmm3, -10 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
 	LOAD( 5 * SIZE, X, %xmm3)
 
 #if defined(PREFETCHW) && !defined(FETCH128)
@@ -288,11 +288,11 @@
 #endif
 
 	SHUFPD_1 %xmm5, %xmm4
-	movaps %xmm4, -8 * SIZE(Y)
+	movups %xmm4, -8 * SIZE(Y)
 	LOAD( 7 * SIZE, X, %xmm4)
 
 	SHUFPD_1 %xmm6, %xmm5
-	movaps %xmm5, -6 * SIZE(Y)
+	movups %xmm5, -6 * SIZE(Y)
 	LOAD( 9 * SIZE, X, %xmm5)
 
 #if defined(PREFETCH) && !defined(FETCH128)
@@ -300,11 +300,11 @@
 #endif
 
 	SHUFPD_1 %xmm7, %xmm6
-	movaps %xmm6, -4 * SIZE(Y)
+	movups %xmm6, -4 * SIZE(Y)
 	LOAD(11 * SIZE, X, %xmm6)
 
 	SHUFPD_1 %xmm0, %xmm7
-	movaps %xmm7, -2 * SIZE(Y)
+	movups %xmm7, -2 * SIZE(Y)
 	LOAD(13 * SIZE, X, %xmm7)
 
 	subq $-16 * SIZE, X
@@ -315,26 +315,26 @@
 .L22:
 	SHUFPD_1 %xmm1, %xmm0
-	movaps %xmm0, -16 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
 	LOAD(-1 * SIZE, X, %xmm0)
 
 	SHUFPD_1 %xmm2, %xmm1
-	movaps %xmm1, -14 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
 
 	SHUFPD_1 %xmm3, %xmm2
-	movaps %xmm2, -12 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
 
 	SHUFPD_1 %xmm4, %xmm3
-	movaps %xmm3, -10 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
 
 	SHUFPD_1 %xmm5, %xmm4
-	movaps %xmm4, -8 * SIZE(Y)
+	movups %xmm4, -8 * SIZE(Y)
 
 	SHUFPD_1 %xmm6, %xmm5
-	movaps %xmm5, -6 * SIZE(Y)
+	movups %xmm5, -6 * SIZE(Y)
 
 	SHUFPD_1 %xmm7, %xmm6
-	movaps %xmm6, -4 * SIZE(Y)
+	movups %xmm6, -4 * SIZE(Y)
 
 	SHUFPD_1 %xmm0, %xmm7
-	movaps %xmm7, -2 * SIZE(Y)
+	movups %xmm7, -2 * SIZE(Y)
 
 	subq $-16 * SIZE, X
 	subq $-16 * SIZE, Y
@@ -345,24 +345,24 @@
 	jle .L24
 	ALIGN_3
 
-	movaps -15 * SIZE(X), %xmm1
-	movaps -13 * SIZE(X), %xmm2
-	movaps -11 * SIZE(X), %xmm3
-	movaps -9 * SIZE(X), %xmm8
+	movups -15 * SIZE(X), %xmm1
+	movups -13 * SIZE(X), %xmm2
+	movups -11 * SIZE(X), %xmm3
+	movups -9 * SIZE(X), %xmm8
 
 	SHUFPD_1 %xmm1, %xmm0
-	movaps %xmm0, -16 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
 	SHUFPD_1 %xmm2, %xmm1
-	movaps %xmm1, -14 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
 	SHUFPD_1 %xmm3, %xmm2
-	movaps %xmm2, -12 * SIZE(Y)
+	movups %xmm2, -12 * SIZE(Y)
 	SHUFPD_1 %xmm8, %xmm3
-	movaps %xmm3, -10 * SIZE(Y)
+	movups %xmm3, -10 * SIZE(Y)
 
-	movaps %xmm8, %xmm0
+	movups %xmm8, %xmm0
 
 	addq $8 * SIZE, X
 	addq $8 * SIZE, Y
@@ -373,15 +373,15 @@
 	jle .L25
 	ALIGN_3
 
-	movaps -15 * SIZE(X), %xmm1
-	movaps -13 * SIZE(X), %xmm2
+	movups -15 * SIZE(X), %xmm1
+	movups -13 * SIZE(X), %xmm2
 
 	SHUFPD_1 %xmm1, %xmm0
 	SHUFPD_1 %xmm2, %xmm1
 
-	movaps %xmm0, -16 * SIZE(Y)
-	movaps %xmm1, -14 * SIZE(Y)
-	movaps %xmm2, %xmm0
+	movups %xmm0, -16 * SIZE(Y)
+	movups %xmm1, -14 * SIZE(Y)
+	movups %xmm2, %xmm0
 
 	addq $4 * SIZE, X
 	addq $4 * SIZE, Y
@@ -392,10 +392,10 @@
 	jle .L26
 	ALIGN_3
 
-	movaps -15 * SIZE(X), %xmm1
+	movups -15 * SIZE(X), %xmm1
 
 	SHUFPD_1 %xmm1, %xmm0
 
-	movaps %xmm0, -16 * SIZE(Y)
+	movups %xmm0, -16 * SIZE(Y)
 
 	addq $2 * SIZE, X
 	addq $2 * SIZE, Y
@@ -424,14 +424,14 @@
 	sarq $4, %rax
 	jle .L23
 
-	movaps -16 * SIZE(X), %xmm0
-	movaps -14 * SIZE(X), %xmm1
-	movaps -12 * SIZE(X), %xmm2
-	movaps -10 * SIZE(X), %xmm3
-	movaps -8 * SIZE(X), %xmm4
-	movaps -6 * SIZE(X), %xmm5
-	movaps -4 * SIZE(X), %xmm6
-	movaps -2 * SIZE(X), %xmm7
+	movups -16 * SIZE(X), %xmm0
+	movups -14 * SIZE(X), %xmm1
+	movups -12 * SIZE(X), %xmm2
+	movups -10 * SIZE(X), %xmm3
+	movups -8 * SIZE(X), %xmm4
+	movups -6 * SIZE(X), %xmm5
+	movups -4 * SIZE(X), %xmm6
+	movups -2 * SIZE(X), %xmm7
 
 	decq %rax
 	jle .L22
@@ -515,16 +515,16 @@
 	jle .L24
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
+	movups -16 * SIZE(X), %xmm0
 	movlps %xmm0, -16 * SIZE(Y)
 	movhps %xmm0, -15 * SIZE(Y)
-	movaps -14 * SIZE(X), %xmm1
+	movups -14 * SIZE(X), %xmm1
 	movlps %xmm1, -14 * SIZE(Y)
 	movhps %xmm1, -13 * SIZE(Y)
-	movaps -12 * SIZE(X), %xmm2
+	movups -12 * SIZE(X), %xmm2
 	movlps %xmm2, -12 * SIZE(Y)
 	movhps %xmm2, -11 * SIZE(Y)
-	movaps -10 * SIZE(X), %xmm3
+	movups -10 * SIZE(X), %xmm3
 	movlps %xmm3, -10 * SIZE(Y)
 	movhps %xmm3, -9 * SIZE(Y)
@@ -537,10 +537,10 @@
 	jle .L25
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
+	movups -16 * SIZE(X), %xmm0
 	movlps %xmm0, -16 * SIZE(Y)
 	movhps %xmm0, -15 * SIZE(Y)
-	movaps -14 * SIZE(X), %xmm1
+	movups -14 * SIZE(X), %xmm1
 	movlps %xmm1, -14 * SIZE(Y)
 	movhps %xmm1, -13 * SIZE(Y)
@@ -553,7 +553,7 @@
 	jle .L26
 	ALIGN_3
 
-	movaps -16 * SIZE(X), %xmm0
+	movups -16 * SIZE(X), %xmm0
 	movlps %xmm0, -16 * SIZE(Y)
 	movhps %xmm0, -15 * SIZE(Y)
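The change throughout this kernel is mechanical: every aligned movaps load/store becomes an unaligned movups, so the copy no longer faults when X or Y is not 16-byte aligned (movaps raises a general-protection fault on a misaligned address, while movups accepts any address at little or no cost on modern cores). A rough C intrinsics analogy of the after state, with illustrative names rather than the kernel's actual code:

	#include <emmintrin.h>  /* SSE2 */

	/* Copy n doubles (n even) from x to y; illustrative only. */
	void dcopy_sketch(const double *x, double *y, long n) {
	    for (long i = 0; i < n; i += 2) {
	        /* The old code's _mm_load_pd/_mm_store_pd equivalents (movapd)
	           would fault unless x+i and y+i were 16-byte aligned.        */
	        __m128d v = _mm_loadu_pd(x + i);   /* movupd: any alignment */
	        _mm_storeu_pd(y + i, v);
	    }
	}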