commit 1c1ca2bc0a

.drone.yml (30 lines changed)

@@ -8,7 +8,7 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'

@@ -32,7 +32,7 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'

@@ -152,7 +152,31 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'USE_OPENMP=1'
+  commands:
+  - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+  - apt-get update -y
+  - apt-get install -y make $CC gfortran perl python g++
+  - $CC --version
+  - make QUIET_MAKE=1 $COMMON_FLAGS
+  - make -C test $COMMON_FLAGS
+  - make -C ctest $COMMON_FLAGS
+  - make -C utest $COMMON_FLAGS
+  - make -C cpp_thread_test dgemm_tester
+---
+kind: pipeline
+name: epyc_native_test
+
+platform:
+  os: linux
+  arch: amd64
+
+steps:
+- name: Build and Test
+  image: ubuntu:18.04
+  environment:
+    CC: gcc
+    COMMON_FLAGS: 'USE_OPENMP=1'

@@ -6,8 +6,11 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)

 Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)

+[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
+
 ## Introduction

 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.

@@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
 - **TSV110**: Optimized some Level-3 helper functions
+- **EMAG 8180**: preliminary support based on A57

 #### PPC/PPC64

@@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.

 For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally, there is an option DYNAMIC_LIST that allows specifying an individual list of targets to include instead of the default.

+DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
+Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.

+For **POWER**, the list encompasses POWER6, POWER8 and POWER9; on **ZARCH** it comprises Z13 and Z14.

 The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
 common code in the library; usually you will want to set this to the oldest model you expect to encounter.
 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
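
As a quick illustration of the runtime detection described in the README text above, the short C program below queries which core a DYNAMIC_ARCH build of OpenBLAS actually selected. This is only a sketch: it assumes the library was built with DYNAMIC_ARCH=1 and relies on the `openblas_get_config()` and `openblas_get_corename()` helpers declared in `cblas.h`; adjust include and link paths to your installation.

```c
/* Sketch: check which target a DYNAMIC_ARCH build of OpenBLAS picked at runtime.
 * Assumes a build such as `make DYNAMIC_ARCH=1 TARGET=NEHALEM` (hypothetical
 * invocation) and linking with -lopenblas; both helpers come from cblas.h. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* Build options and version string compiled into the library. */
    printf("config:   %s\n", openblas_get_config());
    /* Core/kernel set chosen by the runtime CPU detection. */
    printf("corename: %s\n", openblas_get_corename());
    return 0;
}
```

On a DYNAMIC_ARCH build, the reported corename should be one of the targets listed above (for example Haswell on a recent Intel machine), and the config string typically also shows whether DYNAMIC_ARCH was enabled at build time.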

@@ -193,14 +193,14 @@ int main(int argc, char *argv[]){
       a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;

       for(i = j + 1; i < m; i++) {
-        a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+        a[((long)i + (long)j * (long)m) * 2 + 0] = 0;
         a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
       }
     }
   } else {
     for (j = 0; j < m; j++) {
       for(i = 0; i < j; i++) {
-        a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+        a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
         a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
       }

@@ -43,6 +43,7 @@

 #define MB asm("mb")
 #define WMB asm("wmb")
+#define RMB asm("rmb")

 static void __inline blas_lock(unsigned long *address){
 #ifndef __DECC

@@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB
 #define WMB
+#define RMB

 #else

 #define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory")

 #endif

@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
-
+#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")

 #define INLINE inline

@@ -47,6 +47,7 @@

 #define MB
 #define WMB
+#define RMB

 #ifdef __ECC
 #include <ia64intrin.h>

@@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()

 #define INLINE inline

@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()

 #define INLINE inline

@@ -71,9 +71,11 @@
 #if defined(POWER8) || defined(POWER9)
 #define MB __asm__ __volatile__ ("eieio":::"memory")
 #define WMB __asm__ __volatile__ ("eieio":::"memory")
+#define RMB __asm__ __volatile__ ("eieio":::"memory")
 #else
 #define MB __asm__ __volatile__ ("sync")
 #define WMB __asm__ __volatile__ ("sync")
+#define RMB __asm__ __volatile__ ("sync")
 #endif

 #define INLINE inline

@@ -41,6 +41,7 @@

 #define MB __asm__ __volatile__ ("nop")
 #define WMB __asm__ __volatile__ ("nop")
+#define RMB __asm__ __volatile__ ("nop")

 #ifndef ASSEMBLER

@@ -47,6 +47,7 @@

 #define MB
 #define WMB
+#define RMB

 #ifdef C_SUN
 #define __asm__ __asm

@@ -63,13 +63,16 @@
 #ifdef __GNUC__
 #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define RMB
 #else
 #define MB do {} while (0)
 #define WMB do {} while (0)
+#define RMB
 #endif

 static void __inline blas_lock(volatile BLASULONG *address){

 #ifndef C_MSVC
   int ret;
 #else

@@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define COMMON_ZARCH

 #define MB
-//__asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB
-//__asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB

 #define INLINE inline

@@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){
     LOCK_COMMAND(&alloc_lock);
 #endif
     do {
+      RMB;
 #if defined(USE_OPENMP)
       if (!memory[position].used) {
         blas_lock(&memory[position].lock);

@@ -54,7 +54,7 @@
 #ifdef OPTERON
 #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG
 #else
-#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG
+#define LOAD(OFFSET, ADDR, REG) movups OFFSET(ADDR), REG
 #endif

 PROLOGUE

@@ -104,14 +104,14 @@
  sarq $4, %rax
  jle .L13

- movaps -16 * SIZE(X), %xmm0
- movaps -14 * SIZE(X), %xmm1
- movaps -12 * SIZE(X), %xmm2
- movaps -10 * SIZE(X), %xmm3
- movaps -8 * SIZE(X), %xmm4
- movaps -6 * SIZE(X), %xmm5
- movaps -4 * SIZE(X), %xmm6
- movaps -2 * SIZE(X), %xmm7
+ movups -16 * SIZE(X), %xmm0
+ movups -14 * SIZE(X), %xmm1
+ movups -12 * SIZE(X), %xmm2
+ movups -10 * SIZE(X), %xmm3
+ movups -8 * SIZE(X), %xmm4
+ movups -6 * SIZE(X), %xmm5
+ movups -4 * SIZE(X), %xmm6
+ movups -2 * SIZE(X), %xmm7

  decq %rax
  jle .L12

@@ -122,36 +122,36 @@
  PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
 #endif

- movaps %xmm0, -16 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
  LOAD( 0 * SIZE, X, %xmm0)
- movaps %xmm1, -14 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)
  LOAD( 2 * SIZE, X, %xmm1)

 #ifdef PREFETCH
  PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
 #endif

- movaps %xmm2, -12 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)
  LOAD( 4 * SIZE, X, %xmm2)
- movaps %xmm3, -10 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)
  LOAD( 6 * SIZE, X, %xmm3)

 #if defined(PREFETCHW) && !defined(FETCH128)
  PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
 #endif

- movaps %xmm4, -8 * SIZE(Y)
+ movups %xmm4, -8 * SIZE(Y)
  LOAD( 8 * SIZE, X, %xmm4)
- movaps %xmm5, -6 * SIZE(Y)
+ movups %xmm5, -6 * SIZE(Y)
  LOAD(10 * SIZE, X, %xmm5)

 #if defined(PREFETCH) && !defined(FETCH128)
  PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
 #endif

- movaps %xmm6, -4 * SIZE(Y)
+ movups %xmm6, -4 * SIZE(Y)
  LOAD(12 * SIZE, X, %xmm6)
- movaps %xmm7, -2 * SIZE(Y)
+ movups %xmm7, -2 * SIZE(Y)
  LOAD(14 * SIZE, X, %xmm7)

  subq $-16 * SIZE, Y

@@ -161,14 +161,14 @@
  ALIGN_3

 .L12:
- movaps %xmm0, -16 * SIZE(Y)
- movaps %xmm1, -14 * SIZE(Y)
- movaps %xmm2, -12 * SIZE(Y)
- movaps %xmm3, -10 * SIZE(Y)
- movaps %xmm4, -8 * SIZE(Y)
- movaps %xmm5, -6 * SIZE(Y)
- movaps %xmm6, -4 * SIZE(Y)
- movaps %xmm7, -2 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)
+ movups %xmm4, -8 * SIZE(Y)
+ movups %xmm5, -6 * SIZE(Y)
+ movups %xmm6, -4 * SIZE(Y)
+ movups %xmm7, -2 * SIZE(Y)

  subq $-16 * SIZE, Y
  subq $-16 * SIZE, X

@@ -179,15 +179,15 @@
  jle .L14
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
- movaps -14 * SIZE(X), %xmm1
- movaps -12 * SIZE(X), %xmm2
- movaps -10 * SIZE(X), %xmm3
+ movups -16 * SIZE(X), %xmm0
+ movups -14 * SIZE(X), %xmm1
+ movups -12 * SIZE(X), %xmm2
+ movups -10 * SIZE(X), %xmm3

- movaps %xmm0, -16 * SIZE(Y)
- movaps %xmm1, -14 * SIZE(Y)
- movaps %xmm2, -12 * SIZE(Y)
- movaps %xmm3, -10 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)

  addq $8 * SIZE, X
  addq $8 * SIZE, Y

@@ -198,11 +198,11 @@
  jle .L15
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
- movaps -14 * SIZE(X), %xmm1
+ movups -16 * SIZE(X), %xmm0
+ movups -14 * SIZE(X), %xmm1

- movaps %xmm0, -16 * SIZE(Y)
- movaps %xmm1, -14 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)

  addq $4 * SIZE, X
  addq $4 * SIZE, Y

@@ -213,8 +213,8 @@
  jle .L16
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
- movaps %xmm0, -16 * SIZE(Y)
+ movups -16 * SIZE(X), %xmm0
+ movups %xmm0, -16 * SIZE(Y)

  addq $2 * SIZE, X
  addq $2 * SIZE, Y

@@ -246,13 +246,13 @@
  sarq $4, %rax
  jle .L23

- movaps -15 * SIZE(X), %xmm1
- movaps -13 * SIZE(X), %xmm2
- movaps -11 * SIZE(X), %xmm3
- movaps -9 * SIZE(X), %xmm4
- movaps -7 * SIZE(X), %xmm5
- movaps -5 * SIZE(X), %xmm6
- movaps -3 * SIZE(X), %xmm7
+ movups -15 * SIZE(X), %xmm1
+ movups -13 * SIZE(X), %xmm2
+ movups -11 * SIZE(X), %xmm3
+ movups -9 * SIZE(X), %xmm4
+ movups -7 * SIZE(X), %xmm5
+ movups -5 * SIZE(X), %xmm6
+ movups -3 * SIZE(X), %xmm7

  decq %rax
  jle .L22

@@ -264,11 +264,11 @@
 #endif

  SHUFPD_1 %xmm1, %xmm0
- movaps %xmm0, -16 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
  LOAD(-1 * SIZE, X, %xmm0)

  SHUFPD_1 %xmm2, %xmm1
- movaps %xmm1, -14 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)
  LOAD( 1 * SIZE, X, %xmm1)

 #ifdef PREFETCH

@@ -276,11 +276,11 @@
 #endif

  SHUFPD_1 %xmm3, %xmm2
- movaps %xmm2, -12 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)
  LOAD( 3 * SIZE, X, %xmm2)

  SHUFPD_1 %xmm4, %xmm3
- movaps %xmm3, -10 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)
  LOAD( 5 * SIZE, X, %xmm3)

 #if defined(PREFETCHW) && !defined(FETCH128)

@@ -288,11 +288,11 @@
 #endif

  SHUFPD_1 %xmm5, %xmm4
- movaps %xmm4, -8 * SIZE(Y)
+ movups %xmm4, -8 * SIZE(Y)
  LOAD( 7 * SIZE, X, %xmm4)

  SHUFPD_1 %xmm6, %xmm5
- movaps %xmm5, -6 * SIZE(Y)
+ movups %xmm5, -6 * SIZE(Y)
  LOAD( 9 * SIZE, X, %xmm5)

 #if defined(PREFETCH) && !defined(FETCH128)

@@ -300,11 +300,11 @@
 #endif

  SHUFPD_1 %xmm7, %xmm6
- movaps %xmm6, -4 * SIZE(Y)
+ movups %xmm6, -4 * SIZE(Y)
  LOAD(11 * SIZE, X, %xmm6)

  SHUFPD_1 %xmm0, %xmm7
- movaps %xmm7, -2 * SIZE(Y)
+ movups %xmm7, -2 * SIZE(Y)
  LOAD(13 * SIZE, X, %xmm7)

  subq $-16 * SIZE, X

@@ -315,26 +315,26 @@

 .L22:
  SHUFPD_1 %xmm1, %xmm0
- movaps %xmm0, -16 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)
  LOAD(-1 * SIZE, X, %xmm0)

  SHUFPD_1 %xmm2, %xmm1
- movaps %xmm1, -14 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)

  SHUFPD_1 %xmm3, %xmm2
- movaps %xmm2, -12 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)
  SHUFPD_1 %xmm4, %xmm3
- movaps %xmm3, -10 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)

  SHUFPD_1 %xmm5, %xmm4
- movaps %xmm4, -8 * SIZE(Y)
+ movups %xmm4, -8 * SIZE(Y)
  SHUFPD_1 %xmm6, %xmm5
- movaps %xmm5, -6 * SIZE(Y)
+ movups %xmm5, -6 * SIZE(Y)

  SHUFPD_1 %xmm7, %xmm6
- movaps %xmm6, -4 * SIZE(Y)
+ movups %xmm6, -4 * SIZE(Y)
  SHUFPD_1 %xmm0, %xmm7
- movaps %xmm7, -2 * SIZE(Y)
+ movups %xmm7, -2 * SIZE(Y)

  subq $-16 * SIZE, X
  subq $-16 * SIZE, Y

@@ -345,24 +345,24 @@
  jle .L24
  ALIGN_3

- movaps -15 * SIZE(X), %xmm1
- movaps -13 * SIZE(X), %xmm2
- movaps -11 * SIZE(X), %xmm3
- movaps -9 * SIZE(X), %xmm8
+ movups -15 * SIZE(X), %xmm1
+ movups -13 * SIZE(X), %xmm2
+ movups -11 * SIZE(X), %xmm3
+ movups -9 * SIZE(X), %xmm8

  SHUFPD_1 %xmm1, %xmm0
- movaps %xmm0, -16 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)

  SHUFPD_1 %xmm2, %xmm1
- movaps %xmm1, -14 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)

  SHUFPD_1 %xmm3, %xmm2
- movaps %xmm2, -12 * SIZE(Y)
+ movups %xmm2, -12 * SIZE(Y)

  SHUFPD_1 %xmm8, %xmm3
- movaps %xmm3, -10 * SIZE(Y)
+ movups %xmm3, -10 * SIZE(Y)

- movaps %xmm8, %xmm0
+ movups %xmm8, %xmm0

  addq $8 * SIZE, X
  addq $8 * SIZE, Y

@@ -373,15 +373,15 @@
  jle .L25
  ALIGN_3

- movaps -15 * SIZE(X), %xmm1
- movaps -13 * SIZE(X), %xmm2
+ movups -15 * SIZE(X), %xmm1
+ movups -13 * SIZE(X), %xmm2

  SHUFPD_1 %xmm1, %xmm0
  SHUFPD_1 %xmm2, %xmm1

- movaps %xmm0, -16 * SIZE(Y)
- movaps %xmm1, -14 * SIZE(Y)
- movaps %xmm2, %xmm0
+ movups %xmm0, -16 * SIZE(Y)
+ movups %xmm1, -14 * SIZE(Y)
+ movups %xmm2, %xmm0

  addq $4 * SIZE, X
  addq $4 * SIZE, Y

@@ -392,10 +392,10 @@
  jle .L26
  ALIGN_3

- movaps -15 * SIZE(X), %xmm1
+ movups -15 * SIZE(X), %xmm1
  SHUFPD_1 %xmm1, %xmm0

- movaps %xmm0, -16 * SIZE(Y)
+ movups %xmm0, -16 * SIZE(Y)

  addq $2 * SIZE, X
  addq $2 * SIZE, Y

@@ -424,14 +424,14 @@
  sarq $4, %rax
  jle .L23

- movaps -16 * SIZE(X), %xmm0
- movaps -14 * SIZE(X), %xmm1
- movaps -12 * SIZE(X), %xmm2
- movaps -10 * SIZE(X), %xmm3
- movaps -8 * SIZE(X), %xmm4
- movaps -6 * SIZE(X), %xmm5
- movaps -4 * SIZE(X), %xmm6
- movaps -2 * SIZE(X), %xmm7
+ movups -16 * SIZE(X), %xmm0
+ movups -14 * SIZE(X), %xmm1
+ movups -12 * SIZE(X), %xmm2
+ movups -10 * SIZE(X), %xmm3
+ movups -8 * SIZE(X), %xmm4
+ movups -6 * SIZE(X), %xmm5
+ movups -4 * SIZE(X), %xmm6
+ movups -2 * SIZE(X), %xmm7

  decq %rax
  jle .L22

@@ -515,16 +515,16 @@
  jle .L24
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
+ movups -16 * SIZE(X), %xmm0
  movlps %xmm0, -16 * SIZE(Y)
  movhps %xmm0, -15 * SIZE(Y)
- movaps -14 * SIZE(X), %xmm1
+ movups -14 * SIZE(X), %xmm1
  movlps %xmm1, -14 * SIZE(Y)
  movhps %xmm1, -13 * SIZE(Y)
- movaps -12 * SIZE(X), %xmm2
+ movups -12 * SIZE(X), %xmm2
  movlps %xmm2, -12 * SIZE(Y)
  movhps %xmm2, -11 * SIZE(Y)
- movaps -10 * SIZE(X), %xmm3
+ movups -10 * SIZE(X), %xmm3
  movlps %xmm3, -10 * SIZE(Y)
  movhps %xmm3, -9 * SIZE(Y)

@@ -537,10 +537,10 @@
  jle .L25
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
+ movups -16 * SIZE(X), %xmm0
  movlps %xmm0, -16 * SIZE(Y)
  movhps %xmm0, -15 * SIZE(Y)
- movaps -14 * SIZE(X), %xmm1
+ movups -14 * SIZE(X), %xmm1
  movlps %xmm1, -14 * SIZE(Y)
  movhps %xmm1, -13 * SIZE(Y)

@@ -553,7 +553,7 @@
  jle .L26
  ALIGN_3

- movaps -16 * SIZE(X), %xmm0
+ movups -16 * SIZE(X), %xmm0
  movlps %xmm0, -16 * SIZE(Y)
  movhps %xmm0, -15 * SIZE(Y)