Merge branch 'develop'

Zhang Xianyi 2014-10-13 17:10:41 +08:00
commit 7e4e195e82
134 changed files with 28182 additions and 5277 deletions


@@ -1,6 +1,21 @@
OpenBLAS ChangeLog
====================================================================
-Version 0.2.10
+Version 0.2.12
13-Oct-2014
common:
* Added CBLAS interface for ?omatcopy and ?imatcopy.
* Enable ?gemm3m functions.
* Added benchmark for ?gemm3m.
* Optimized multithreading lower limits.
* Disabled SYMM3M and HEMM3M functions
because of segment violations.
x86/x86-64:
* Improved axpy and symv performance on AMD Bulldozer.
* Improved gemv performance on modern Intel and AMD CPUs.
====================================================================
Version 0.2.11
18-Aug-2014
common:
* Added some benchmark codes.


@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.2.11
+VERSION = 0.2.12
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library


@@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -350,6 +350,7 @@ ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -wd981
endif
ifeq ($(USE_OPENMP), 1)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))


@@ -55,16 +55,23 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
-- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
+- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
-- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
+- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
-- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
+- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
-- **AMD PILEDRIVER**: Used Bulldozer codes.
+- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1/2.
- **ICT Loongson 3B**: Experimental
#### ARM:
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM64:
- **ARMV8**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
@@ -116,8 +123,8 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
-* The number of CPUs/Cores should less than or equal to 256.
+* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64 (amd64), there is experimental support for up to 1024 CPUs/Cores and 128 NUMA nodes if you build the library with BIGNUMA=1.
-* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
+* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line NO_AFFINITY=1 in Makefile.rule, but this may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, make test may fail with a pthread_create error (EAGAIN); the same test case passes when run directly from the shell.
## Contributing ## Contributing


@@ -19,6 +19,7 @@ PENRYN
DUNNINGTON
NEHALEM
SANDYBRIDGE
HASWELL
ATOM
b)AMD CPU:
@@ -30,6 +31,7 @@ SHANGHAI
ISTANBUL
BOBCAT
BULLDOZER
PILEDRIVER
c)VIA CPU:
SSE_GENERIC
@@ -59,3 +61,7 @@ ITANIUM2
SPARC
SPARCV7
6.ARM CPU:
ARMV7
ARMV6
ARMV5


@@ -35,7 +35,10 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto \
-ssymv.goto dsymv.goto \
+sdot.goto ddot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
chemv.goto zhemv.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
@@ -53,7 +56,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml \
-ssymv.acml dsymv.acml \
+sdot.acml ddot.acml \
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
ssymv.acml dsymv.acml csymv.acml zsymv.acml \
chemv.acml zhemv.acml \
chemm.acml zhemm.acml \
cherk.acml zherk.acml \
cher2k.acml zher2k.acml \
@@ -71,7 +77,10 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
sger.atlas dger.atlas \
-ssymv.atlas dsymv.atlas \
+sdot.atlas ddot.atlas \
saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
chemv.atlas zhemv.atlas \
chemm.acml zhemm.acml \
chemm.atlas zhemm.atlas \
cherk.atlas zherk.atlas \
@@ -90,7 +99,10 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl \
-ssymv.mkl dsymv.mkl \
+sdot.mkl ddot.mkl \
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
chemv.mkl zhemv.mkl \
chemm.mkl zhemm.mkl \
cherk.mkl zherk.mkl \
cher2k.mkl zher2k.mkl \
@@ -100,7 +112,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \
ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
-all :: goto atlas acml mkl
goto_3m :: cgemm3m.goto zgemm3m.goto
mkl_3m :: cgemm3m.mkl zgemm3m.mkl
+all :: goto mkl atlas acml
##################################### Slinpack ####################################################
slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)
@@ -732,6 +749,32 @@ dsymv.atlas : dsymv.$(SUFFIX)
dsymv.mkl : dsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Csymv ####################################################
csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
csymv.acml : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
csymv.atlas : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
csymv.mkl : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zsymv ####################################################
zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zsymv.acml : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zsymv.atlas : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zsymv.mkl : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgeev ####################################################
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
@@ -896,6 +939,131 @@ zpotrf.atlas : zpotrf.$(SUFFIX)
zpotrf.mkl : zpotrf.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Chemv ####################################################
chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
chemv.acml : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
chemv.atlas : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
chemv.mkl : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zhemv ####################################################
zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zhemv.acml : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zhemv.atlas : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zhemv.mkl : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sdot ####################################################
sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
sdot.acml : sdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
sdot.atlas : sdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
sdot.mkl : sdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ddot ####################################################
ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
ddot.acml : ddot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ddot.atlas : ddot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ddot.mkl : ddot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
saxpy.acml : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
saxpy.atlas : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
saxpy.mkl : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Daxpy ####################################################
daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
daxpy.acml : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
daxpy.atlas : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
daxpy.mkl : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Caxpy ####################################################
caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
caxpy.acml : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
caxpy.atlas : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
caxpy.mkl : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zaxpy ####################################################
zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zaxpy.acml : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zaxpy.atlas : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zaxpy.mkl : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Cgemm3m ####################################################
cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
cgemm3m.mkl : cgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zgemm3m ####################################################
zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zgemm3m.mkl : zgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
###################################################################################################
@@ -1037,6 +1205,12 @@ ssymv.$(SUFFIX) : symv.c
dsymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
csymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zsymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sgeev.$(SUFFIX) : geev.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@@ -1073,8 +1247,35 @@ cpotrf.$(SUFFIX) : potrf.c
zpotrf.$(SUFFIX) : potrf.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
chemv.$(SUFFIX) : hemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zhemv.$(SUFFIX) : hemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
ddot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
saxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
daxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
caxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zaxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
cgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
clean ::

benchmark/axpy.c (new file, 201 lines)

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPY BLASFUNC(zaxpy)
#else
#define AXPY BLASFUNC(caxpy)
#endif
#else
#ifdef DOUBLE
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
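/* DELTA_EPOCH_IN_MICROSECS is the number of microseconds between the Windows
   FILETIME epoch (1601-01-01) and the Unix epoch (1970-01-01); the gettimeofday()
   shim below subtracts it to convert a FILETIME into Unix seconds/microseconds. */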
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int MAIN__(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
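/* Optional environment overrides: OPENBLAS_LOOPS repeats each problem size,
   OPENBLAS_INCX and OPENBLAS_INCY set the strides of the x and y vectors. */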
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
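/* axpy does one multiply and one add per element (2*m flops for real data);
   the COMPSIZE*COMPSIZE factor scales this up for complex types. */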
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

benchmark/dot.c (new file, 195 lines)

@@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int MAIN__(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
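/* a real dot product performs one multiply and one add per element, i.e. 2*m flops */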
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));


@@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
-fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
+if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);

benchmark/gemm3m.c (new file, 212 lines)

@@ -0,0 +1,212 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMM
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMM BLASFUNC(dgemm)
#else
#define GEMM BLASFUNC(sgemm)
#endif
#else
#ifdef DOUBLE
#define GEMM BLASFUNC(zgemm3m)
#else
#define GEMM BLASFUNC(cgemm3m)
#endif
#endif
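/* The 3m algorithm exists only for complex types, so a real-precision build of
   this benchmark simply falls back to the standard sgemm/dgemm. */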
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int MAIN__(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char trans='N';
blasint m, i, j;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));


@@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){
blasint inc_x=1,inc_y=1;
blasint n=0;
int has_param_n = 0;
int has_param_m = 0;
int loops = 1;
int l;
char *p;
@@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
int tomax = to;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
-if ((n>0) && (n<=to)) has_param_n = 1;
+if ((n>0)) has_param_n = 1;
+if ( n > tomax ) tomax = n;
}
-if ( has_param_n == 0 )
-fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
+if ((p = getenv("OPENBLAS_PARAM_M"))) {
+m = atoi(p);
+if ((m>0)) has_param_m = 1;
+if ( m > tomax ) tomax = m;
+}
+if ( has_param_n == 1 )
+fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops);
+else
+fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
-if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
@@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, " SIZE Flops\n");
-for(m = from; m <= to; m += step)
-{
-timeg=0;
-if ( has_param_n == 0 ) n = m;
-fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
-for(j = 0; j < m; j++){
-for(i = 0; i < n * COMPSIZE; i++){
-a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-}
-}
-for (l=0; l<loops; l++)
-{
-for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
-x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-}
-for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
-y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-}
-gettimeofday( &start, (struct timezone *)0);
-GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
-gettimeofday( &stop, (struct timezone *)0);
-time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-timeg += time1;
-}
-timeg /= loops;
-fprintf(stderr,
-" %10.2f MFlops\n",
-COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
-}
+if (has_param_m == 0)
+{
+for(m = from; m <= to; m += step)
+{
+timeg=0;
+if ( has_param_n == 0 ) n = m;
+fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+for(j = 0; j < m; j++){
+for(i = 0; i < n * COMPSIZE; i++){
+a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+}
+for (l=0; l<loops; l++)
+{
+for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
+x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+gettimeofday( &start, (struct timezone *)0);
+GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+gettimeofday( &stop, (struct timezone *)0);
+time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+timeg += time1;
+}
+timeg /= loops;
+fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+}
+}
+else
+{
+for(n = from; n <= to; n += step)
+{
+timeg=0;
+fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+for(j = 0; j < m; j++){
+for(i = 0; i < n * COMPSIZE; i++){
+a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+}
+for (l=0; l<loops; l++)
+{
+for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
+x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+}
+gettimeofday( &start, (struct timezone *)0);
+GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+gettimeofday( &stop, (struct timezone *)0);
+time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+timeg += time1;
+}
+timeg /= loops;
+fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+}
+}
return 0;

benchmark/hemv.c (new file, 208 lines)

@@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMV
#ifdef DOUBLE
#define HEMV BLASFUNC(zhemv)
#else
#define HEMV BLASFUNC(chemv)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int MAIN__(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
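/* hemv does roughly m*m complex multiply-adds; with COMPSIZE == 2 the
   COMPSIZE*COMPSIZE*2*m*m expression below reports this as real flops. */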
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

benchmark/tplot-header (new file, 42 lines)

@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Sgemv\nTRANS=T\nBulldozer"
plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
set output "print.png";
show title;
show plot;
show output;

cblas.h (23 changed lines)

@@ -243,8 +243,13 @@ void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
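The cgemm3m/zgemm3m entry points added above take the same arguments as cblas_cgemm/cblas_zgemm, with alpha and beta passed by pointer. A minimal usage sketch, not part of this diff (the 2x2 matrices and values are made up purely for illustration):

/* C = alpha*A*B + beta*C using the complex 3m algorithm */
#include <cblas.h>

int main(void) {
    /* 2x2 single-precision complex matrices, interleaved (re,im), row major */
    float A[] = {1,0, 2,0, 3,0, 4,0};
    float B[] = {1,0, 0,0, 0,0, 1,0};      /* identity matrix */
    float C[] = {0,0, 0,0, 0,0, 0,0};
    float alpha[] = {1.0f, 0.0f}, beta[] = {0.0f, 0.0f};

    cblas_cgemm3m(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                  2, 2, 2, alpha, A, 2, B, 2, beta, C, 2);
    return 0;
}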
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
@@ -318,6 +323,24 @@ void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLA
void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy);
void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a,
OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb);
void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a,
OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb);
void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
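A short usage sketch for the new extension routines, not part of this diff; it assumes crows/ccols and clda describe the source matrix a, and cldb is the leading dimension of the destination b (the 2x3 matrix is made up for illustration):

/* B = alpha * A^T: out-of-place scaled transpose with somatcopy */
#include <cblas.h>

int main(void) {
    float A[2*3] = {1, 2, 3,
                    4, 5, 6};   /* 2x3, row major */
    float B[3*2];               /* receives the 3x2 transpose */

    cblas_somatcopy(CblasRowMajor, CblasTrans, 2, 3, 1.0f, A, 3, B, 2);
    return 0;
}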
#ifdef __cplusplus
}
#endif /* __cplusplus */


@@ -231,8 +231,12 @@ void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
@@ -306,7 +310,23 @@ void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, f
void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy);
void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
blasint clda, float *b, blasint cldb);
void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
blasint clda, double *b, blasint cldb);
void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
blasint clda, void *b, blasint cldb);
void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
blasint clda, void *b, blasint cldb);
void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
blasint clda, blasint cldb);
void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
blasint clda, blasint cldb);
void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a,
blasint clda, blasint cldb);
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
blasint clda, blasint cldb);
#ifdef __cplusplus
}
#endif /* __cplusplus */


@@ -435,6 +435,9 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int cgemm3m_p, cgemm3m_q, cgemm3m_r;
int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn;
int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *);
@ -595,6 +598,9 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int zgemm3m_p, zgemm3m_q, zgemm3m_r;
int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn;
int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *);
@ -757,6 +763,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int xgemm3m_p, xgemm3m_q, xgemm3m_r;
int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn;
int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
@ -900,6 +909,27 @@ extern gotoblas_t *gotoblas;
#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n #define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n
#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn #define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn
#define CGEMM3M_P gotoblas -> cgemm3m_p
#define CGEMM3M_Q gotoblas -> cgemm3m_q
#define CGEMM3M_R gotoblas -> cgemm3m_r
#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m
#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n
#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn
#define ZGEMM3M_P gotoblas -> zgemm3m_p
#define ZGEMM3M_Q gotoblas -> zgemm3m_q
#define ZGEMM3M_R gotoblas -> zgemm3m_r
#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m
#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n
#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn
#define XGEMM3M_P gotoblas -> xgemm3m_p
#define XGEMM3M_Q gotoblas -> xgemm3m_q
#define XGEMM3M_R gotoblas -> xgemm3m_r
#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m
#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n
#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn
#else #else
#define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define DTB_ENTRIES DTB_DEFAULT_ENTRIES
@ -972,6 +1002,55 @@ extern gotoblas_t *gotoblas;
#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N #define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N
#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) #define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N))
#ifdef CGEMM3M_DEFAULT_UNROLL_N
#define CGEMM3M_P CGEMM3M_DEFAULT_P
#define CGEMM3M_Q CGEMM3M_DEFAULT_Q
#define CGEMM3M_R CGEMM3M_DEFAULT_R
#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M
#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N
#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N))
#else
#define CGEMM3M_P SGEMM_DEFAULT_P
#define CGEMM3M_Q SGEMM_DEFAULT_Q
#define CGEMM3M_R SGEMM_DEFAULT_R
#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M
#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N
#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N))
#endif
#ifdef ZGEMM3M_DEFAULT_UNROLL_N
#define ZGEMM3M_P ZGEMM3M_DEFAULT_P
#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q
#define ZGEMM3M_R ZGEMM3M_DEFAULT_R
#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M
#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N
#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N))
#else
#define ZGEMM3M_P DGEMM_DEFAULT_P
#define ZGEMM3M_Q DGEMM_DEFAULT_Q
#define ZGEMM3M_R DGEMM_DEFAULT_R
#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M
#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N
#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N))
#endif
#define XGEMM3M_P QGEMM_DEFAULT_P
#define XGEMM3M_Q QGEMM_DEFAULT_Q
#define XGEMM3M_R QGEMM_DEFAULT_R
#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M
#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N
#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N))
#endif #endif
#endif #endif
@ -1054,14 +1133,14 @@ extern gotoblas_t *gotoblas;
#endif #endif
#ifdef XDOUBLE #ifdef XDOUBLE
#define GEMM3M_UNROLL_M QGEMM_UNROLL_M #define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M
#define GEMM3M_UNROLL_N QGEMM_UNROLL_N #define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N
#elif defined(DOUBLE) #elif defined(DOUBLE)
#define GEMM3M_UNROLL_M DGEMM_UNROLL_M #define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M
#define GEMM3M_UNROLL_N DGEMM_UNROLL_N #define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N
#else #else
#define GEMM3M_UNROLL_M SGEMM_UNROLL_M #define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M
#define GEMM3M_UNROLL_N SGEMM_UNROLL_N #define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N
#endif #endif
@ -1123,31 +1202,31 @@ extern gotoblas_t *gotoblas;
#ifndef GEMM3M_P #ifndef GEMM3M_P
#ifdef XDOUBLE #ifdef XDOUBLE
#define GEMM3M_P QGEMM_P #define GEMM3M_P XGEMM3M_P
#elif defined(DOUBLE) #elif defined(DOUBLE)
#define GEMM3M_P DGEMM_P #define GEMM3M_P ZGEMM3M_P
#else #else
#define GEMM3M_P SGEMM_P #define GEMM3M_P CGEMM3M_P
#endif #endif
#endif #endif
#ifndef GEMM3M_Q #ifndef GEMM3M_Q
#ifdef XDOUBLE #ifdef XDOUBLE
#define GEMM3M_Q QGEMM_Q #define GEMM3M_Q XGEMM3M_Q
#elif defined(DOUBLE) #elif defined(DOUBLE)
#define GEMM3M_Q DGEMM_Q #define GEMM3M_Q ZGEMM3M_Q
#else #else
#define GEMM3M_Q SGEMM_Q #define GEMM3M_Q CGEMM3M_Q
#endif #endif
#endif #endif
#ifndef GEMM3M_R #ifndef GEMM3M_R
#ifdef XDOUBLE #ifdef XDOUBLE
#define GEMM3M_R QGEMM_R #define GEMM3M_R XGEMM3M_R
#elif defined(DOUBLE) #elif defined(DOUBLE)
#define GEMM3M_R DGEMM_R #define GEMM3M_R ZGEMM3M_R
#else #else
#define GEMM3M_R SGEMM_R #define GEMM3M_R CGEMM3M_R
#endif #endif
#endif #endif
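The fallbacks above route the generic GEMM3M_* blocking and unroll macros to the per-precision 3M defaults, or to the plain real-precision values when no 3M tuning exists. For background, and purely as an illustration rather than library code: the ?gemm3m routines these parameters feed rely on the classic 3M identity, which forms a complex product from three real multiplications instead of four. A scalar sketch:

/* 3M identity: (ar + i*ai)(br + i*bi) from the three real products
   t1 = ar*br, t2 = ai*bi, t3 = (ar+ai)*(br+bi). Applied blockwise to the
   real/imaginary parts of the matrices, this trades one real GEMM for a
   few extra additions. */
#include <stdio.h>

int main(void)
{
    double ar = 0.7, ai = -0.9, br = 1.3, bi = -1.1;

    double t1 = ar * br;
    double t2 = ai * bi;
    double t3 = (ar + ai) * (br + bi);

    printf("3M:     %g + %gi\n", t1 - t2, t3 - t1 - t2);
    printf("direct: %g + %gi\n", ar*br - ai*bi, ar*bi + ai*br);
    return 0;
}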
@ -46,6 +46,7 @@
#define __volatile__ #define __volatile__
#endif #endif
/*
#ifdef HAVE_SSE2 #ifdef HAVE_SSE2
#define MB __asm__ __volatile__ ("mfence"); #define MB __asm__ __volatile__ ("mfence");
#define WMB __asm__ __volatile__ ("sfence"); #define WMB __asm__ __volatile__ ("sfence");
@ -53,6 +54,10 @@
#define MB #define MB
#define WMB #define WMB
#endif #endif
*/
#define MB
#define WMB
static void __inline blas_lock(volatile BLASULONG *address){ static void __inline blas_lock(volatile BLASULONG *address){
@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
: "0" (op)); : "0" (op));
} }
/*
#define WHEREAMI #define WHEREAMI
*/
static inline int WhereAmI(void){ static inline int WhereAmI(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
@ -111,6 +118,7 @@ static inline int WhereAmI(void){
return apicid; return apicid;
} }
#ifdef CORE_BARCELONA #ifdef CORE_BARCELONA
#define IFLUSH gotoblas_iflush() #define IFLUSH gotoblas_iflush()
#define IFLUSH_HALF gotoblas_iflush_half() #define IFLUSH_HALF gotoblas_iflush_half()
@ -59,9 +59,16 @@
void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx);
#else #else
static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
#if defined(__i386__) && defined(__PIC__)
__asm__ __volatile__
("mov %%ebx, %%edi;"
"cpuid;"
"xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
#else
__asm__ __volatile__ __asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
#endif
} }
#endif #endif
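The new branch keeps cpuid usable in 32-bit position-independent builds, where %ebx is reserved for the GOT pointer and cannot safely be named as an asm operand; it is parked in %edi around the CPUID instruction instead. A standalone sketch of the same pattern follows (x86/x86-64 only; the function and variable names are illustrative, not the library's):

#include <stdio.h>
#include <string.h>

static void cpuid_demo(int op, int *eax, int *ebx, int *ecx, int *edx)
{
#if defined(__i386__) && defined(__PIC__)
  /* save the GOT register %ebx in %edi across CPUID */
  __asm__ __volatile__
    ("mov %%ebx, %%edi;"
     "cpuid;"
     "xchgl %%ebx, %%edi;"
     : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
#else
  __asm__ __volatile__
    ("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
#endif
}

int main(void)
{
  int eax, ebx, ecx, edx;
  char vendor[13];

  cpuid_demo(0, &eax, &ebx, &ecx, &edx);   /* leaf 0: vendor id in EBX, EDX, ECX */
  memcpy(vendor + 0, &ebx, 4);
  memcpy(vendor + 4, &edx, 4);
  memcpy(vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  printf("max leaf %d, vendor %s\n", eax, vendor);
  return 0;
}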
@ -74,6 +74,18 @@ else
OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3
endif endif
all3_3m: xzcblat3_3m xccblat3_3m
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
clean :: clean ::
rm -f x* rm -f x*
@ -103,6 +115,9 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
# Double complex # Double complex
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
@ -111,4 +126,9 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
include $(TOPDIR)/Makefile.tail include $(TOPDIR)/Makefile.tail
@ -45,8 +45,238 @@ void F77_c3chke(char * rout) {
F77_xerbla(cblas_rout,&cblas_info); F77_xerbla(cblas_rout,&cblas_info);
} }
if (strncmp( sf,"cblas_cgemm" ,11)==0) {
cblas_rout = "cblas_cgemm" ; if (strncmp( sf,"cblas_cgemm3m" ,13)==0) {
cblas_rout = "cblas_cgemm3" ;
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
} else if (strncmp( sf,"cblas_cgemm" ,11)==0) {
cblas_rout = "cblas_cgemm" ;
cblas_info = 1; cblas_info = 1;
cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
@ -88,6 +88,7 @@ void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n,
cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc ); b, *ldb, beta, c, *ldc );
} }
void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n,
CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
@ -563,3 +564,84 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb); a, *lda, b, *ldb);
} }
void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
CBLAS_TEST_COMPLEX *c, int *ldc ) {
CBLAS_TEST_COMPLEX *A, *B, *C;
int i,j,LDA, LDB, LDC;
enum CBLAS_TRANSPOSE transa, transb;
get_transpose_type(transpa, &transa);
get_transpose_type(transpb, &transb);
if (*order == TEST_ROW_MJR) {
if (transa == CblasNoTrans) {
LDA = *k+1;
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
A[i*LDA+j].imag=a[j*(*lda)+i].imag;
}
}
else {
LDA = *m+1;
A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*k; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
A[i*LDA+j].imag=a[j*(*lda)+i].imag;
}
}
if (transb == CblasNoTrans) {
LDB = *n+1;
B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
}
else {
LDB = *k+1;
B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
}
LDC = *n+1;
C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX));
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA,
B, LDB, beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
c[j*(*ldc)+i].real=C[i*LDC+j].real;
c[j*(*ldc)+i].imag=C[i*LDC+j].imag;
}
free(A);
free(B);
free(C);
}
else if (*order == TEST_COL_MJR)
cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
else
cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
}
ctest/c_cblat3_3m.f (new file, 2786 lines; diff too large to display)
@ -45,8 +45,242 @@ void F77_z3chke(char * rout) {
F77_xerbla(cblas_rout,&cblas_info); F77_xerbla(cblas_rout,&cblas_info);
} }
if (strncmp( sf,"cblas_zgemm" ,11)==0) {
cblas_rout = "cblas_zgemm" ;
if (strncmp( sf,"cblas_zgemm3m" ,13)==0) {
cblas_rout = "cblas_zgemm3" ;
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
} else if (strncmp( sf,"cblas_zgemm" ,11)==0) {
cblas_rout = "cblas_zgemm" ;
cblas_info = 1; cblas_info = 1;
cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
@ -562,3 +562,82 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb); a, *lda, b, *ldb);
} }
void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
CBLAS_TEST_ZOMPLEX *A, *B, *C;
int i,j,LDA, LDB, LDC;
enum CBLAS_TRANSPOSE transa, transb;
get_transpose_type(transpa, &transa);
get_transpose_type(transpb, &transb);
if (*order == TEST_ROW_MJR) {
if (transa == CblasNoTrans) {
LDA = *k+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
A[i*LDA+j].imag=a[j*(*lda)+i].imag;
}
}
else {
LDA = *m+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*k; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
A[i*LDA+j].imag=a[j*(*lda)+i].imag;
}
}
if (transb == CblasNoTrans) {
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
}
else {
LDB = *k+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA,
B, LDB, beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
c[j*(*ldc)+i].real=C[i*LDC+j].real;
c[j*(*ldc)+i].imag=C[i*LDC+j].imag;
}
free(A);
free(B);
free(C);
}
else if (*order == TEST_COL_MJR)
cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
else
cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
}
ctest/c_zblat3_3m.f (new file, 2791 lines; diff too large to display)
@ -173,12 +173,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm cdtrmm_ #define F77_dtrmm cdtrmm_
#define F77_dtrsm cdtrsm_ #define F77_dtrsm cdtrsm_
#define F77_cgemm ccgemm_ #define F77_cgemm ccgemm_
#define F77_cgemm3m ccgemm3m_
#define F77_csymm ccsymm_ #define F77_csymm ccsymm_
#define F77_csyrk ccsyrk_ #define F77_csyrk ccsyrk_
#define F77_csyr2k ccsyr2k_ #define F77_csyr2k ccsyr2k_
#define F77_ctrmm cctrmm_ #define F77_ctrmm cctrmm_
#define F77_ctrsm cctrsm_ #define F77_ctrsm cctrsm_
#define F77_zgemm czgemm_ #define F77_zgemm czgemm_
#define F77_zgemm3m czgemm3m_
#define F77_zsymm czsymm_ #define F77_zsymm czsymm_
#define F77_zsyrk czsyrk_ #define F77_zsyrk czsyrk_
#define F77_zsyr2k czsyr2k_ #define F77_zsyr2k czsyr2k_
@ -333,12 +335,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm CDTRMM #define F77_dtrmm CDTRMM
#define F77_dtrsm CDTRSM #define F77_dtrsm CDTRSM
#define F77_cgemm CCGEMM #define F77_cgemm CCGEMM
#define F77_cgemm3m CCGEMM3M
#define F77_csymm CCSYMM #define F77_csymm CCSYMM
#define F77_csyrk CCSYRK #define F77_csyrk CCSYRK
#define F77_csyr2k CCSYR2K #define F77_csyr2k CCSYR2K
#define F77_ctrmm CCTRMM #define F77_ctrmm CCTRMM
#define F77_ctrsm CCTRSM #define F77_ctrsm CCTRSM
#define F77_zgemm CZGEMM #define F77_zgemm CZGEMM
#define F77_zgemm3m CZGEMM3M
#define F77_zsymm CZSYMM #define F77_zsymm CZSYMM
#define F77_zsyrk CZSYRK #define F77_zsyrk CZSYRK
#define F77_zsyr2k CZSYR2K #define F77_zsyr2k CZSYR2K
@ -493,12 +497,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm cdtrmm #define F77_dtrmm cdtrmm
#define F77_dtrsm cdtrsm #define F77_dtrsm cdtrsm
#define F77_cgemm ccgemm #define F77_cgemm ccgemm
#define F77_cgemm3m ccgemm3m
#define F77_csymm ccsymm #define F77_csymm ccsymm
#define F77_csyrk ccsyrk #define F77_csyrk ccsyrk
#define F77_csyr2k ccsyr2k #define F77_csyr2k ccsyr2k
#define F77_ctrmm cctrmm #define F77_ctrmm cctrmm
#define F77_ctrsm cctrsm #define F77_ctrsm cctrsm
#define F77_zgemm czgemm #define F77_zgemm czgemm
#define F77_zgemm3m czgemm3m
#define F77_zsymm czsymm #define F77_zsymm czsymm
#define F77_zsyrk czsyrk #define F77_zsyrk czsyrk
#define F77_zsyr2k czsyr2k #define F77_zsyr2k czsyr2k
ctest/cin3_3m (new file, 22 lines):
@ -0,0 +1,22 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
6 NUMBER OF VALUES OF N
0 1 2 3 5 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA
(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
3 NUMBER OF VALUES OF BETA
(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
cblas_chemm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_csymm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_ctrmm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_ctrsm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_cherk F PUT F FOR NO TEST. SAME COLUMNS.
cblas_csyrk F PUT F FOR NO TEST. SAME COLUMNS.
cblas_cher2k F PUT F FOR NO TEST. SAME COLUMNS.
cblas_csyr2k F PUT F FOR NO TEST. SAME COLUMNS.
ctest/zin3_3m (new file, 22 lines):
@ -0,0 +1,22 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
7 NUMBER OF VALUES OF N
0 1 2 3 5 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA
(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
3 NUMBER OF VALUES OF BETA
(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
cblas_zgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
cblas_zhemm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_zsymm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_ztrmm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_ztrsm F PUT F FOR NO TEST. SAME COLUMNS.
cblas_zherk F PUT F FOR NO TEST. SAME COLUMNS.
cblas_zsyrk F PUT F FOR NO TEST. SAME COLUMNS.
cblas_zher2k F PUT F FOR NO TEST. SAME COLUMNS.
cblas_zsyr2k F PUT F FOR NO TEST. SAME COLUMNS.
@ -4,11 +4,11 @@ include ../../Makefile.system
USE_GEMM3M = 0 USE_GEMM3M = 0
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
USE_GEMM3M = 0 USE_GEMM3M = 1
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
USE_GEMM3M = 0 USE_GEMM3M = 1
endif endif
ifeq ($(ARCH), ia64) ifeq ($(ARCH), ia64)
@ -251,7 +251,11 @@ void blas_set_parameter(void){
env_var_t p; env_var_t p;
int factor; int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
int size = 16;
#else
int size = get_L2_size(); int size = get_L2_size();
#endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
size >>= 7; size >>= 7;
@ -52,7 +52,9 @@
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
); );
@exblasobjs = ( @exblasobjs = (
@ -73,7 +75,7 @@
); );
@gemm3mobjs = ( @gemm3mobjs = (
cgemm3m,zgemm3m
); );
@ -4,11 +4,11 @@ include $(TOPDIR)/Makefile.system
SUPPORT_GEMM3M = 0 SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 0 SUPPORT_GEMM3M = 1
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
SUPPORT_GEMM3M = 0 SUPPORT_GEMM3M = 1
endif endif
ifeq ($(ARCH), ia64) ifeq ($(ARCH), ia64)
@ -128,9 +128,11 @@ ZBLAS3OBJS = \
ifeq ($(SUPPORT_GEMM3M), 1) ifeq ($(SUPPORT_GEMM3M), 1)
CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) # CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
CBLAS3OBJS += cgemm3m.$(SUFFIX)
ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) # ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX)
ZBLAS3OBJS += zgemm3m.$(SUFFIX)
endif endif
@ -267,7 +269,7 @@ CSBLAS2OBJS = \
CSBLAS3OBJS = \ CSBLAS3OBJS = \
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)
CDBLAS1OBJS = \ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
@ -283,7 +285,7 @@ CDBLAS2OBJS = \
CDBLAS3OBJS += \ CDBLAS3OBJS += \
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX)
CCBLAS1OBJS = \ CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
@ -305,7 +307,9 @@ CCBLAS2OBJS = \
CCBLAS3OBJS = \ CCBLAS3OBJS = \
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)
CZBLAS1OBJS = \ CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
@ -327,7 +331,19 @@ CZBLAS2OBJS = \
CZBLAS3OBJS = \ CZBLAS3OBJS = \
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX) cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX)
# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX)
CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX)
endif
ifndef NO_CBLAS ifndef NO_CBLAS
@ -1771,6 +1787,13 @@ cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c
cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c
$(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F)
cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c
$(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F)
cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c
$(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F)
sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
@ -2035,25 +2058,49 @@ cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c
domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
@ -405,49 +405,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#ifndef COMPLEX #ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k; double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (16.0 * 1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1; nthreads_max = 1;
else
{
if ( MNK <= (2.0 * 65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#else #else
double MNK = (double) args.m * (double) args.n * (double) args.k; double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1; nthreads_max = 1;
else
{
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#endif #endif
args.common = NULL; args.common = NULL;
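The deleted branches scaled the thread count through several M/N/K sub-cases; the replacement keeps a single cutoff on the product M*N*K (65536 * GEMM_MULTITHREAD_THRESHOLD for real ?gemm, 8192 * the threshold for complex) below which the call stays single-threaded. An illustrative restatement, not the library's internal API, with an assumed example threshold value:

#include <stdio.h>

/* "threshold" stands in for GEMM_MULTITHREAD_THRESHOLD, a build-time constant */
static int gemm_single_threaded(double m, double n, double k,
                                double threshold, int is_complex)
{
    double mnk = m * n * k;
    double cutoff = (is_complex ? 8192.0 : 65536.0) * threshold;
    return mnk <= cutoff;        /* nonzero: keep the call on one thread */
}

int main(void)
{
    /* 4.0 is only an example threshold value */
    printf("%d\n", gemm_single_threaded(64, 64, 64, 4.0, 0));     /* 1 */
    printf("%d\n", gemm_single_threaded(512, 512, 512, 4.0, 0));  /* 0 */
    return 0;
}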
@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order,
int nthreads_avail = nthreads_max; int nthreads_avail = nthreads_max;
double MNK = (double) m * (double) n; double MNK = (double) m * (double) n;
if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
nthreads_max = 1; nthreads_max = 1;
if ( nthreads_max > nthreads_avail ) if ( nthreads_max > nthreads_avail )
@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef malloc #undef malloc
#undef free #undef free
#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
{ {
@ -71,6 +72,28 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'R' ) trans = BlasNoTrans;
if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'T' ) trans = BlasTrans;
if ( Trans == 'C' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans;
#else
void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb)
{
char Order, Trans;
int order=-1,trans=-1;
blasint info = -1;
FLOAT *b;
size_t msize;
blasint *lda, *ldb, *rows, *cols;
FLOAT *alpha;
if ( CORDER == CblasColMajor) order = BlasColMajor;
if ( CORDER == CblasRowMajor) order = BlasRowMajor;
if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans) trans = BlasNoTrans;
if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans;
rows = &crows;
cols = &ccols;
alpha = &calpha;
lda = &clda;
ldb = &cldb;
#endif
if ( order == BlasColMajor) if ( order == BlasColMajor)
{ {
@ -47,6 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasNoTrans 0 #define BlasNoTrans 0
#define BlasTrans 1 #define BlasTrans 1
#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
{ {
@ -66,7 +67,27 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'R' ) trans = BlasNoTrans;
if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'T' ) trans = BlasTrans;
if ( Trans == 'C' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans;
#else
void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, FLOAT *b, blasint cldb)
{
blasint *rows, *cols, *lda, *ldb;
FLOAT *alpha;
int order=-1,trans=-1;
blasint info = -1;
if ( CORDER == CblasColMajor ) order = BlasColMajor;
if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans ) trans = BlasNoTrans;
if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans;
rows = &crows;
cols = &ccols;
lda = &clda;
ldb = &cldb;
alpha = &calpha;
#endif
if ( order == BlasColMajor) if ( order == BlasColMajor)
{ {
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
@ -238,7 +238,7 @@ void CNAME(enum CBLAS_ORDER order,
int nthreads_avail = nthreads_max; int nthreads_avail = nthreads_max;
double MNK = (double) m * (double) n; double MNK = (double) m * (double) n;
if ( MNK <= (80.0 * 20.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
nthreads_max = 1; nthreads_max = 1;
if ( nthreads_max > nthreads_avail ) if ( nthreads_max > nthreads_avail )
@ -49,6 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasTransConj 2 #define BlasTransConj 2
#define BlasConj 3 #define BlasConj 3
#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
{ {
@ -71,6 +73,30 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'C' ) trans = BlasTransConj;
if ( Trans == 'R' ) trans = BlasConj; if ( Trans == 'R' ) trans = BlasConj;
#else
void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, blasint cldb)
{
blasint *rows, *cols, *lda, *ldb;
int order=-1,trans=-1;
blasint info = -1;
FLOAT *b;
size_t msize;
if ( CORDER == CblasColMajor ) order = BlasColMajor;
if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
if ( CTRANS == CblasNoTrans) trans = BlasNoTrans;
if ( CTRANS == CblasConjNoTrans ) trans = BlasConj;
if ( CTRANS == CblasTrans) trans = BlasTrans;
if ( CTRANS == CblasConjTrans) trans = BlasTransConj;
rows = &crows;
cols = &ccols;
lda = &clda;
ldb = &cldb;
#endif
if ( order == BlasColMajor) if ( order == BlasColMajor)
{ {
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
@ -49,6 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasTransConj 2 #define BlasTransConj 2
#define BlasConj 3 #define BlasConj 3
#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
{ {
@ -69,6 +70,26 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'C' ) trans = BlasTransConj;
if ( Trans == 'R' ) trans = BlasConj; if ( Trans == 'R' ) trans = BlasConj;
#else
void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, FLOAT*b, blasint cldb)
{
blasint *rows, *cols, *lda, *ldb;
int order=-1,trans=-1;
blasint info = -1;
if ( CORDER == CblasColMajor ) order = BlasColMajor;
if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
if ( CTRANS == CblasNoTrans) trans = BlasNoTrans;
if ( CTRANS == CblasConjNoTrans ) trans = BlasConj;
if ( CTRANS == CblasTrans) trans = BlasTrans;
if ( CTRANS == CblasConjTrans) trans = BlasTransConj;
rows = &crows;
cols = &ccols;
lda = &clda;
ldb = &cldb;
#endif
if ( order == BlasColMajor) if ( order == BlasColMajor)
{ {
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
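For the complex copy routines, alpha travels by address as a {real, imag} pair, matching the void* prototypes added to cblas.h above. A hypothetical call (illustrative only, assuming a build with these extensions):

#include <stdio.h>
#include <cblas.h>

int main(void)
{
    double alpha[2] = { 1.0, 0.0 };      /* alpha = 1 + 0i, passed by address */
    double a[2*2*2] = { 1, 2,  3, 4,     /* 2x2 complex, row-major, lda = 2 */
                        5, 6,  7, 8 };
    double b[2*2*2] = { 0 };             /* receives conj(A)^T, ldb = 2 */

    cblas_zomatcopy(CblasRowMajor, CblasConjTrans, 2, 2, alpha, a, 2, b, 2);
    printf("b[0][0] = %g %+gi\n", b[0], b[1]);   /* 1 -2i */
    return 0;
}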
kernel/arm/symv_L.c (new file, 70 lines):
@ -0,0 +1,70 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG jx,jy;
BLASLONG j;
FLOAT temp1;
FLOAT temp2;
#if 0
if ( m != offset )
printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif
jx = 0;
jy = 0;
for (j=0; j<offset; j++)
{
temp1 = alpha * x[jx];
temp2 = 0.0;
y[jy] += temp1 * a[j*lda+j];
iy = jy;
ix = jx;
for (i=j+1; i<m; i++)
{
ix += inc_x;
iy += inc_y;
y[iy] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[ix];
}
y[jy] += alpha * temp2;
jx += inc_x;
jy += inc_y;
}
return(0);
}
kernel/arm/symv_U.c (new file, 71 lines):
@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG jx,jy;
BLASLONG j;
FLOAT temp1;
FLOAT temp2;
#if 0
if( m != offset )
printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif
BLASLONG m1 = m - offset;
jx = m1 * inc_x;
jy = m1 * inc_y;
for (j=m1; j<m; j++)
{
temp1 = alpha * x[jx];
temp2 = 0.0;
iy = 0;
ix = 0;
for (i=0; i<j; i++)
{
y[iy] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[ix];
ix += inc_x;
iy += inc_y;
}
y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
jx += inc_x;
jy += inc_y;
}
return(0);
}

@@ -293,6 +293,14 @@ gotoblas_t TABLE_NAME = {
#endif
chemm_outcopyTS, chemm_oltcopyTS,
0, 0, 0,
#ifdef CGEMM3M_DEFAULT_UNROLL_M
CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
#else
SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
#endif
cgemm3m_kernelTS, cgemm3m_kernelTS,
cgemm3m_incopybTS, cgemm3m_incopyrTS, cgemm3m_incopybTS, cgemm3m_incopyrTS,
@@ -391,6 +399,14 @@ gotoblas_t TABLE_NAME = {
#endif
zhemm_outcopyTS, zhemm_oltcopyTS,
0, 0, 0,
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
#else
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
#endif
zgemm3m_kernelTS, zgemm3m_kernelTS,
zgemm3m_incopybTS, zgemm3m_incopyrTS, zgemm3m_incopybTS, zgemm3m_incopyrTS,
@@ -486,6 +502,9 @@ gotoblas_t TABLE_NAME = {
#endif
xhemm_outcopyTS, xhemm_oltcopyTS,
0, 0, 0,
QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
xgemm3m_kernelTS, xgemm3m_kernelTS,
xgemm3m_incopybTS, xgemm3m_incopyrTS, xgemm3m_incopybTS, xgemm3m_incopyrTS,
@@ -661,9 +680,23 @@ static void init_parameter(void) {
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
#ifdef CGEMM3M_DEFAULT_Q
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
#else
TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q;
#endif
#ifdef ZGEMM3M_DEFAULT_Q
TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
#else
TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q;
#endif
#ifdef EXPRECISION
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q;
#endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
@@ -918,20 +951,56 @@ static void init_parameter(void) {
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef CGEMM3M_DEFAULT_P
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
#else
TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
#endif
#ifdef ZGEMM3M_DEFAULT_P
TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
#else
TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
#endif
#ifdef EXPRECISION
TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
#endif
TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1);
#ifdef CGEMM3M_DEFAULT_UNROLL_M
TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1) & ~(CGEMM3M_DEFAULT_UNROLL_M - 1);
#else
TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1);
#endif
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
TABLE_NAME.zgemm3m_p = (TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1) & ~(ZGEMM3M_DEFAULT_UNROLL_M - 1);
#else
TABLE_NAME.zgemm3m_p = (TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1);
#endif
#ifdef QUAD_PRECISION
TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.xgemm3m_p = (TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1);
#endif
#ifdef DEBUG
@@ -965,11 +1034,32 @@ static void init_parameter(void) {
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE -
((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15);
TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE -
((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15);
#ifdef EXPRECISION
TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE -
((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15);
#endif
}

@@ -1,8 +1,20 @@
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t.c SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S

@@ -1,14 +1,14 @@
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t.c SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n.c DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t.c DGEMVTKERNEL = dgemv_t_4.c
ZGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c
CGEMVNKERNEL = cgemv_n.c CGEMVNKERNEL = cgemv_n_4.c
CGEMVTKERNEL = cgemv_t.c CGEMVTKERNEL = cgemv_t_4.c
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c

@@ -1,5 +1,17 @@
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t.c SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S

@@ -1,11 +1,12 @@
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t.c SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

@@ -1,7 +1,7 @@
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t.c SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S

kernel/x86_64/caxpy.c
@@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"
#endif
#ifndef HAVE_KERNEL_8
static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
BLASLONG register ix = 0;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
while(i < n)
{
#if !defined(CONJ)
y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
#else
y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
#endif
ix+=4 ;
i+=2 ;
}
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT da[2];
if ( n <= 0 ) return(0);
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -8;
if ( n1 )
{
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_8(n1, x, y , &da );
ix = 2 * n1;
}
i = n1;
while(i < n)
{
#if !defined(CONJ)
y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
#else
y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
#endif
i++ ;
ix += 2;
}
return(0);
}
inc_x *=2;
inc_y *=2;
while(i < n)
{
#if !defined(CONJ)
y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
#else
y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
#endif
ix += inc_x ;
iy += inc_y ;
i++ ;
}
return(0);
}
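Editorial note: the scalar paths above implement single-precision complex AXPY on interleaved (re, im) pairs, y += alpha * x, and the CONJ build instead accumulates alpha * conj(x). A small sketch (not part of this diff) restating one pair of iterations with C99 complex arithmetic:

#include <stdio.h>
#include <complex.h>

int main(void)
{
    float complex alpha = 2.0f + 1.0f * I;
    float x[4] = { 1.0f, 2.0f, 3.0f, -1.0f };    /* two complex values, interleaved */
    float y[4] = { 0.5f, 0.5f, 1.0f,  1.0f };

    for (int ix = 0; ix < 4; ix += 2) {
        float complex xi = x[ix] + x[ix + 1] * I;
        float complex yi = y[ix] + y[ix + 1] * I;
        yi += alpha * xi;            /* the CONJ variant would use conjf(xi)        */
        y[ix]     = crealf(yi);      /* same result as the da_r/da_i expressions    */
        y[ix + 1] = cimagf(yi);
    }
    printf("y = (%g,%g) (%g,%g)\n", y[0], y[1], y[2], y[3]);   /* (0.5,5.5) (8,2) */
    return 0;
}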

@@ -0,0 +1,135 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 768(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
"vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
"vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
"prefetcht0 768(%3,%0,4) \n\t"
#if !defined(CONJ)
"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t"
"vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t"
"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t"
"vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t"
"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t"
"vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t"
"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t"
"vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm10, %%xmm10 \n\t"
"vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t"
"vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t"
"vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t"
"vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t"
#else
"vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
"vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
"vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
"vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
"vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
"vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t"
"vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t"
"vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t"
"vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t"
#endif
"vmovups %%xmm12, (%3,%0,4) \n\t"
"vmovups %%xmm13, 16(%3,%0,4) \n\t"
"vmovups %%xmm14, 32(%3,%0,4) \n\t"
"vmovups %%xmm15, 48(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
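Editorial note: the kernel above uses the standard SSE/AVX complex-multiply idiom. One product is taken with the broadcast real part of alpha (vfmaddps against y), a second with the re/im-swapped x (vpermilps $0xb1) times the imaginary part, and vaddsubps combines them, subtracting in the even (real) lanes and adding in the odd (imaginary) lanes. A scalar model of one (re, im) pair, offered only as an illustration:

#include <stdio.h>

int main(void)
{
    float ar = 2.0f, ai = 1.0f;                  /* alpha = 2 + 1i         */
    float x[2] = { 3.0f, 4.0f };                 /* x = 3 + 4i, (re, im)   */
    float y[2] = { 0.0f, 0.0f };

    float t[2] = { y[0] + ar * x[0], y[1] + ar * x[1] };   /* vfmaddps            */
    float s[2] = { ai * x[1], ai * x[0] };                 /* vpermilps + vmulps  */
    float r[2] = { t[0] - s[0], t[1] + s[1] };             /* vaddsubps           */

    printf("y + alpha*x = (%g, %g)\n", r[0], r[1]);        /* expect (2, 11)      */
    return 0;
}

For alpha = 2+i and x = 3+4i the printed pair is (2, 11), i.e. alpha*x, which is exactly what each vfmaddps/vpermilps/vaddsubps group accumulates into y.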

@@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
addq $6*SIZE, BO addq $ 6*SIZE, BO
addq $16*SIZE, AO addq $ 16*SIZE, AO
decq %rax
.endm
@@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
addq $6*SIZE, BO addq $ 6*SIZE, BO
addq $8*SIZE, AO addq $ 8*SIZE, AO
decq %rax
.endm
@@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $6*SIZE, BO addq $ 6*SIZE, BO
addq $4*SIZE, AO addq $ 4*SIZE, AO
decq %rax
.endm
@@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $6*SIZE, BO addq $ 6*SIZE, BO
addq $2*SIZE, AO addq $ 2*SIZE, AO
decq %rax
.endm

@@ -1,255 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#if defined(HASWELL)
#include "cgemv_n_microk_haswell-2.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_16x4
static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
y[i] += a0[i]*x[0] - a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
y[i] += a1[i]*x[2] - a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
y[i] += a2[i]*x[4] - a2[i+1] * x[5];
y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
y[i] += a3[i]*x[6] - a3[i+1] * x[7];
y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
#else
y[i] += a0[i]*x[0] + a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
y[i] += a1[i]*x[2] + a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
y[i] += a2[i]*x[4] + a2[i+1] * x[5];
y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
y[i] += a3[i]*x[6] + a3[i+1] * x[7];
y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
#endif
}
}
#endif
static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
y[i] += a0[i]*x[0] - a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
#else
y[i] += a0[i]*x[0] + a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
#endif
}
}
static void zero_y(BLASLONG n, FLOAT *dest)
{
BLASLONG i;
for ( i=0; i<2*n; i++ )
{
*dest = 0.0;
dest++;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
BLASLONG i;
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
#if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
temp_r = alpha_r * src[0] + alpha_i * src[1];
temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif
*dest += temp_r;
*(dest+1) += temp_i;
src+=2;
dest += inc_dest;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG n2;
FLOAT xbuffer[8],*ybuffer;
#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
inc_x *= 2;
inc_y *= 2;
lda *= 2;
n1 = n / 4 ;
n2 = n % 4 ;
m1 = m - ( m % 16 );
m2 = (m % NBMAX) - (m % 16) ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
zero_y(NB,ybuffer);
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
xbuffer[3] = x_ptr[1];
x_ptr += inc_x;
xbuffer[4] = x_ptr[0];
xbuffer[5] = x_ptr[1];
x_ptr += inc_x;
xbuffer[6] = x_ptr[0];
xbuffer[7] = x_ptr[1];
x_ptr += inc_x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
a_ptr += 4 * lda;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
}
add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
a += 2 * NB;
y_ptr += NB * inc_y;
}
j=0;
while ( j < (m % 16))
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
y_ptr += inc_y;
a+=2;
j++;
}
return(0);
}

kernel/x86_64/cgemv_n_4.c
@@ -0,0 +1,623 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x4
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
y[i] += a0[i]*x[0] - a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
y[i] += a1[i]*x[2] - a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
y[i] += a2[i]*x[4] - a2[i+1] * x[5];
y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
y[i] += a3[i]*x[6] - a3[i+1] * x[7];
y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
#else
y[i] += a0[i]*x[0] + a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
y[i] += a1[i]*x[2] + a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
y[i] += a2[i]*x[4] + a2[i+1] * x[5];
y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
y[i] += a3[i]*x[6] + a3[i+1] * x[7];
y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
#endif
}
}
#endif
#ifndef HAVE_KERNEL_4x2
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1;
a0 = ap[0];
a1 = ap[1];
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
y[i] += a0[i]*x[0] - a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
y[i] += a1[i]*x[2] - a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
#else
y[i] += a0[i]*x[0] + a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
y[i] += a1[i]*x[2] + a1[i+1] * x[3];
y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
#endif
}
}
#endif
#ifndef HAVE_KERNEL_4x1
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
y[i] += a0[i]*x[0] - a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
#else
y[i] += a0[i]*x[0] + a0[i+1] * x[1];
y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
#endif
}
}
#endif
#ifndef HAVE_KERNEL_ADDY
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
BLASLONG i;
if ( inc_dest != 2 )
{
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
#if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
temp_r = alpha_r * src[0] + alpha_i * src[1];
temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif
*dest += temp_r;
*(dest+1) += temp_i;
src+=2;
dest += inc_dest;
}
return;
}
FLOAT temp_r0;
FLOAT temp_i0;
FLOAT temp_r1;
FLOAT temp_i1;
FLOAT temp_r2;
FLOAT temp_i2;
FLOAT temp_r3;
FLOAT temp_i3;
for ( i=0; i<n; i+=4 )
{
#if !defined(XCONJ)
temp_r0 = alpha_r * src[0] - alpha_i * src[1];
temp_i0 = alpha_r * src[1] + alpha_i * src[0];
temp_r1 = alpha_r * src[2] - alpha_i * src[3];
temp_i1 = alpha_r * src[3] + alpha_i * src[2];
temp_r2 = alpha_r * src[4] - alpha_i * src[5];
temp_i2 = alpha_r * src[5] + alpha_i * src[4];
temp_r3 = alpha_r * src[6] - alpha_i * src[7];
temp_i3 = alpha_r * src[7] + alpha_i * src[6];
#else
temp_r0 = alpha_r * src[0] + alpha_i * src[1];
temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
temp_r1 = alpha_r * src[2] + alpha_i * src[3];
temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
temp_r2 = alpha_r * src[4] + alpha_i * src[5];
temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
temp_r3 = alpha_r * src[6] + alpha_i * src[7];
temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
#endif
dest[0] += temp_r0;
dest[1] += temp_i0;
dest[2] += temp_r1;
dest[3] += temp_i1;
dest[4] += temp_r2;
dest[5] += temp_i2;
dest[6] += temp_r3;
dest[7] += temp_i3;
src += 8;
dest += 8;
}
return;
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4;
FLOAT xbuffer[8],*ybuffer;
#if 0
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
#endif
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
inc_x *= 2;
inc_y *= 2;
lda *= 2;
lda4 = 4 * lda;
n1 = n / 4 ;
n2 = n % 4 ;
m3 = m % 4;
m1 = m - ( m % 4 );
m2 = (m % NBMAX) - (m % 4) ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
x_ptr = x;
//zero_y(NB,ybuffer);
memset(ybuffer,0,NB*8);
if ( inc_x == 2 )
{
for( i = 0; i < n1 ; i++)
{
cgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 8;
}
if ( n2 & 2 )
{
cgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if ( n2 & 1 )
{
cgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
x_ptr += 2;
a_ptr += lda;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
xbuffer[3] = x_ptr[1];
x_ptr += inc_x;
xbuffer[4] = x_ptr[0];
xbuffer[5] = x_ptr[1];
x_ptr += inc_x;
xbuffer[6] = x_ptr[0];
xbuffer[7] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
}
}
add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
a += 2 * NB;
y_ptr += NB * inc_y;
}
if ( m3 == 0 ) return(0);
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 )
{
for( i=0 ; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
a_ptr += 4;
x_ptr += 4;
}
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += 2;
x_ptr += 2;
}
}
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 )
{
for( i = 0; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += 4;
x_ptr += 2;
}
}
else
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
return(0);
}
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 )
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += 6;
x_ptr += 2;
}
}
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
return(0);
}
return(0);
}
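Editorial note on the driver above: the inner kernels never scale by alpha themselves. The driver zeroes a ybuffer panel, accumulates A*x into it four columns at a time (with 2- and 1-column fallbacks for the remainder), and only add_y applies alpha while scattering into y. Rows are processed in panels of at most NBMAX = 2048 so the buffer stays cache-resident, and the final m % 4 rows are handled by the scalar m3 paths. A compact sketch of just that blocking arithmetic, with hypothetical sizes and the kernel calls reduced to a comment:

#include <stdio.h>

#define NBMAX 2048

int main(void)
{
    int m = 5003, n = 10;                 /* hypothetical problem size          */
    int n1 = n / 4, n2 = n % 4;           /* 4-column groups and remainder      */
    int m3 = m % 4;                       /* rows left for the scalar tail      */
    int m1 = m - m3;                      /* rows covered by the panel loop     */
    int m2 = (m % NBMAX) - m3;            /* size of the last, shorter panel    */

    int NB = NBMAX;
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;           /* nothing left but the m3 tail       */
            NB = m2;                      /* final partial panel                */
        }
        printf("panel of %4d rows: %d four-column kernel calls, %d remainder columns\n",
               NB, n1, n2);
        /* cgemv_kernel_4x4 / 4x2 / 4x1 calls and add_y() would go here */
    }
    printf("scalar tail rows: %d\n", m3);
    return 0;
}

For m = 5003 this prints panels of 2048, 2048 and 904 rows plus a 3-row scalar tail, which is how the driver walks a and y_ptr before the m3 special cases take over.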

@@ -1,137 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16x4 1
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
"vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
"vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
"vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
"prefetcht0 320(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
"vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%6,%0,4) \n\t"
"vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
"vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%7,%0,4) \n\t"
"vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
"vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
"vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%3,%0,4) \n\t"
"vmovups (%3,%0,4), %%ymm10 \n\t"
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

@@ -0,0 +1,542 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
"vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
"vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
"vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
"prefetcht0 320(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
"vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%6,%0,4) \n\t"
"vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
"vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%7,%0,4) \n\t"
"vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
"vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
"vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%3,%0,4) \n\t"
"vmovups (%3,%0,4), %%ymm10 \n\t"
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"cmpq $4, %8 \n\t"
"jne .L02END%= \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
"vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
"vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmovups (%3,%0,4), %%ymm10 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (n2) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x2 1
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
"prefetcht0 320(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
"vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%3,%0,4) \n\t"
"vmovups (%3,%0,4), %%ymm10 \n\t"
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmovups (%3,%0,4), %%ymm10 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (n2) // 6
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x1 1
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"prefetcht0 320(%3,%0,4) \n\t"
"vmovups (%3,%0,4), %%ymm10 \n\t"
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
#endif
"addq $16, %0 \n\t"
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
"subq $8 , %1 \n\t"
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"cmpq $4, %5 \n\t"
"jne .L02END%= \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmovups (%3,%0,4), %%ymm10 \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (n2) // 5
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
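/* Sketch of the row-count split used by the kernels in this file (illustration
   only, mirroring the n1/n2 setup above): the unrolled loop consumes multiples
   of 8 complex elements, one optional 4-element tail follows, and any n & 3
   remainder is assumed to be handled by the caller. */
static inline void split_rows_sketch(BLASLONG n, BLASLONG *main_part, BLASLONG *tail4)
{
    *main_part = n & -8;   /* length handled by .L01LOOP           */
    *tail4     = n & 4;    /* nonzero when the half-width tail runs */
}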
#define HAVE_KERNEL_ADDY 1
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
BLASLONG i;
if ( inc_dest != 2 )
{
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
#if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
temp_r = alpha_r * src[0] + alpha_i * src[1];
temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif
*dest += temp_r;
*(dest+1) += temp_i;
src+=2;
dest += inc_dest;
}
return;
}
i=0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%4), %%ymm0 \n\t" // alpha_r
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
"vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from dest
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
#if !defined(XCONJ)
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
#endif
"addq $16, %0 \n\t"
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
"subq $8 , %1 \n\t"
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
"vmovups (%3,%0,4), %%ymm10 \n\t"
#if !defined(XCONJ)
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
#else
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
#endif
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (src), // 2
"r" (dest), // 3
"r" (&alpha_r), // 4
"r" (&alpha_i), // 5
"r" (n2) // 6
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
return;
}


@ -1,265 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "cgemv_t_microk_haswell-2.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_16x4
static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp_r0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_r3 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_i2 = 0.0;
FLOAT temp_i3 = 0.0;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
#else
temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
#endif
}
y[0] = temp_r0;
y[1] = temp_i0;
y[2] = temp_r1;
y[3] = temp_i1;
y[4] = temp_r2;
y[5] = temp_i2;
y[6] = temp_r3;
y[7] = temp_i3;
}
#endif
static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a0[i]*x[i] - a0[i+1]*x[i+1];
temp_i += a0[i]*x[i+1] + a0[i+1]*x[i];
#else
temp_r += a0[i]*x[i] + a0[i+1]*x[i+1];
temp_i += a0[i]*x[i+1] - a0[i+1]*x[i];
#endif
}
*y = temp_r;
*(y+1) = temp_i;
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
*(dest+1) = *(src+1);
dest+=2;
src += inc_src;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[8];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG n2;
FLOAT ybuffer[8],*xbuffer;
inc_x *= 2;
inc_y *= 2;
lda *= 2;
xbuffer = buffer;
n1 = n / 4 ;
n2 = n % 4 ;
m1 = m - ( m % 16 );
m2 = (m % NBMAX) - (m % 16) ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
copy_x(NB,x_ptr,xbuffer,inc_x);
for( i = 0; i < n1 ; i++)
{
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
a_ptr += 4 * lda;
#if !defined(XCONJ)
y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[2] - alpha_i * ybuffer[3];
y_ptr[1] += alpha_r * ybuffer[3] + alpha_i * ybuffer[2];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[4] - alpha_i * ybuffer[5];
y_ptr[1] += alpha_r * ybuffer[5] + alpha_i * ybuffer[4];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[6] - alpha_i * ybuffer[7];
y_ptr[1] += alpha_r * ybuffer[7] + alpha_i * ybuffer[6];
y_ptr += inc_y;
#else
y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[2] + alpha_i * ybuffer[3];
y_ptr[1] -= alpha_r * ybuffer[3] - alpha_i * ybuffer[2];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[4] + alpha_i * ybuffer[5];
y_ptr[1] -= alpha_r * ybuffer[5] - alpha_i * ybuffer[4];
y_ptr += inc_y;
y_ptr[0] += alpha_r * ybuffer[6] + alpha_i * ybuffer[7];
y_ptr[1] -= alpha_r * ybuffer[7] - alpha_i * ybuffer[6];
y_ptr += inc_y;
#endif
}
for( i = 0; i < n2 ; i++)
{
cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
#if !defined(XCONJ)
y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
y_ptr += inc_y;
#else
y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
y_ptr += inc_y;
#endif
}
a += 2* NB;
x += NB * inc_x;
}
BLASLONG m3 = m % 16;
if ( m3 == 0 ) return(0);
x_ptr = x;
copy_x(m3,x_ptr,xbuffer,inc_x);
j=0;
a_ptr = a;
y_ptr = y;
while ( j < n)
{
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
for( i = 0; i < m3*2; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[i] * xbuffer[i] - a_ptr[i+1] * xbuffer[i+1];
temp_i += a_ptr[i] * xbuffer[i+1] + a_ptr[i+1] * xbuffer[i];
#else
temp_r += a_ptr[i] * xbuffer[i] + a_ptr[i+1] * xbuffer[i+1];
temp_i += a_ptr[i] * xbuffer[i+1] - a_ptr[i+1] * xbuffer[i];
#endif
}
a_ptr += lda;
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
y_ptr += inc_y;
j++;
}
return(0);
}

579
kernel/x86_64/cgemv_t_4.c Normal file
View File

@ -0,0 +1,579 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "cgemv_t_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x4
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT alpha_r = alpha[0];
FLOAT alpha_i = alpha[1];
FLOAT temp_r0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_r3 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_i2 = 0.0;
FLOAT temp_i3 = 0.0;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
#else
temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
#endif
}
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
#endif
}
#endif
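/* The two preprocessor branches in these fallback kernels differ only in the
   sign of the imaginary cross terms: the first accumulates a * x, the second
   conj(a) * x.  A minimal scalar sketch (illustration only): */
static inline void cdot_step_sketch(FLOAT a_r, FLOAT a_i, FLOAT x_r, FLOAT x_i,
                                    FLOAT *t_r, FLOAT *t_i, int conjugate_a)
{
    if (!conjugate_a) {
        *t_r += a_r * x_r - a_i * x_i;
        *t_i += a_r * x_i + a_i * x_r;
    } else {
        *t_r += a_r * x_r + a_i * x_i;
        *t_i += a_r * x_i - a_i * x_r;
    }
}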
#ifndef HAVE_KERNEL_4x2
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1;
a0 = ap[0];
a1 = ap[1];
FLOAT alpha_r = alpha[0];
FLOAT alpha_i = alpha[1];
FLOAT temp_r0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_i1 = 0.0;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
#else
temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
#endif
}
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
}
#endif
#ifndef HAVE_KERNEL_4x1
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
FLOAT alpha_r = alpha[0];
FLOAT alpha_i = alpha[1];
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
for ( i=0; i< 2*n; i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
#else
temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
#endif
}
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
#endif
}
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
*(dest+1) = *(src+1);
dest+=2;
src += inc_src;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[8];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4;
FLOAT ybuffer[8],*xbuffer;
FLOAT alpha[2];
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
inc_x <<= 1;
inc_y <<= 1;
lda <<= 1;
lda4 = lda << 2;
xbuffer = buffer;
n1 = n >> 2 ;
n2 = n & 3 ;
m3 = m & 3 ;
m1 = m - m3;
m2 = (m & (NBMAX-1)) - m3 ;
alpha[0] = alpha_r;
alpha[1] = alpha_i;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_x != 2 )
copy_x(NB,x_ptr,xbuffer,inc_x);
else
xbuffer = x_ptr;
if ( inc_y == 2 )
{
for( i = 0; i < n1 ; i++)
{
cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
y_ptr += 8;
}
if ( n2 & 2 )
{
cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
a_ptr += lda * 2;
y_ptr += 4;
}
if ( n2 & 1 )
{
cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
a_ptr += lda;
y_ptr += 2;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
memset(ybuffer,0,32);
cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
y_ptr[0] += ybuffer[2];
y_ptr[1] += ybuffer[3];
y_ptr += inc_y;
y_ptr[0] += ybuffer[4];
y_ptr[1] += ybuffer[5];
y_ptr += inc_y;
y_ptr[0] += ybuffer[6];
y_ptr[1] += ybuffer[7];
y_ptr += inc_y;
}
for( i = 0; i < n2 ; i++)
{
memset(ybuffer,0,32);
cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
a_ptr += lda;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
}
}
a += 2 * NB;
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
x_ptr = x;
j=0;
a_ptr = a;
y_ptr = y;
if ( m3 == 3 )
{
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
if ( m3 == 2 )
{
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT temp_r1 ;
FLOAT temp_i1 ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 - ai * temp_i1;
y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 + ai * temp_i1;
y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j+=2;
}
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
if ( m3 == 1 )
{
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT temp_r1 ;
FLOAT temp_i1 ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 - ai * temp_i1;
y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 + ai * temp_i1;
y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j+=2;
}
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
return(0);
}
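/* Plain reference computation that CNAME above accelerates, written out for
   the transposed, non-conjugated case with unit strides and column-major A
   (lda2 counts FLOATs, i.e. 2*lda; illustration only). */
static void cgemv_t_reference_sketch(BLASLONG m, BLASLONG n,
                                     FLOAT alpha_r, FLOAT alpha_i,
                                     FLOAT *a, BLASLONG lda2,
                                     FLOAT *x, FLOAT *y)
{
    BLASLONG i, j;
    for (j = 0; j < n; j++) {
        FLOAT *col = a + j * lda2;
        FLOAT t_r = 0.0, t_i = 0.0;
        for (i = 0; i < 2 * m; i += 2) {
            t_r += col[i] * x[i]     - col[i + 1] * x[i + 1];
            t_i += col[i] * x[i + 1] + col[i + 1] * x[i];
        }
        y[2 * j]     += alpha_r * t_r - alpha_i * t_i;
        y[2 * j + 1] += alpha_r * t_i + alpha_i * t_r;
    }
}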


@ -1,171 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16x4 1
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
"vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
"vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"prefetcht0 192(%2,%0,4) \n\t"
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"prefetcht0 192(%6,%0,4) \n\t"
"vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
"prefetcht0 192(%7,%0,4) \n\t"
"vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 2 complex values from a0
"vmovups 32(%5,%0,4), %%ymm5 \n\t" // 2 complex values from a1
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vmovups 32(%6,%0,4), %%ymm6 \n\t" // 2 complex values from a2
"vmovups 32(%7,%0,4), %%ymm7 \n\t" // 2 complex values from a3
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
"vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t"
#else
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
"vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
#endif
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
"vextractf128 $1, %%ymm12, %%xmm13 \n\t"
"vextractf128 $1, %%ymm14, %%xmm15 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
"vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
"vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t"
"vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
"vmovsd %%xmm8 , (%3) \n\t"
"vmovsd %%xmm10, 8(%3) \n\t"
"vmovsd %%xmm12, 16(%3) \n\t"
"vmovsd %%xmm14, 24(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@ -0,0 +1,539 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
"vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
"vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
"vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"prefetcht0 192(%2,%0,4) \n\t"
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"prefetcht0 192(%6,%0,4) \n\t"
"vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
"prefetcht0 192(%7,%0,4) \n\t"
"vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
"vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L08END%=: \n\t"
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
"vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
"vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t"
"vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t"
#else
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
"vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
"vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t"
"vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
#endif
"vmovsd (%3), %%xmm4 \n\t" // read y
"vmovsd 8(%3), %%xmm5 \n\t"
"vmovsd 16(%3), %%xmm6 \n\t"
"vmovsd 24(%3), %%xmm7 \n\t"
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
"vextractf128 $1, %%ymm12, %%xmm13 \n\t"
"vextractf128 $1, %%ymm14, %%xmm15 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
"vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
"vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t"
"vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
"vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
"vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r
"vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r
#if !defined(XCONJ)
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
"vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
"vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t"
"vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t"
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
"vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
"vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t"
"vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t"
#else
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t"
"vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
#endif
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
"vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t"
"vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t"
"vmovsd %%xmm8 , (%3) \n\t"
"vmovsd %%xmm10, 8(%3) \n\t"
"vmovsd %%xmm12, 16(%3) \n\t"
"vmovsd %%xmm14, 24(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
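/* Sketch of the epilogue reduction above (illustration only): each YMM
   accumulator holds four complex partial sums; vextractf128/vaddps folds the
   upper half onto the lower and vshufpd/vaddps folds the remaining two complex
   values into one (real, imag) sum, which is then scaled by alpha and added to
   y before the vmovsd store. */
static void hadd_complex4_sketch(const FLOAT acc[8], FLOAT *out_r, FLOAT *out_i)
{
    *out_r = acc[0] + acc[2] + acc[4] + acc[6];
    *out_i = acc[1] + acc[3] + acc[5] + acc[7];
}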
#define HAVE_KERNEL_4x2 1
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"prefetcht0 192(%2,%0,4) \n\t"
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L08END%=: \n\t"
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
"vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
"vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
#else
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
"vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
#endif
"vmovsd (%3), %%xmm4 \n\t" // read y
"vmovsd 8(%3), %%xmm5 \n\t"
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
"vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
"vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
#if !defined(XCONJ)
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
"vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
"vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
#else
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
#endif
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
"vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
"vmovsd %%xmm8 , (%3) \n\t"
"vmovsd %%xmm10, 8(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x1 1
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%2,%0,4) \n\t"
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
"vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L08END%=: \n\t"
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
#else
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
#endif
"vmovsd (%3), %%xmm4 \n\t" // read y
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
#if !defined(XCONJ)
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
#else
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
#endif
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
"vmovsd %%xmm8 , (%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (alpha) // 5
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
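/* Sketch of the vpermilps/vblendps operand preparation shared by the three
   kernels above (illustration only): from interleaved x = (r0,i0,r1,i1,...),
   one vector is built with each real part duplicated into its pair and one
   with each imaginary part duplicated, so a plain vfmadd against the
   interleaved matrix data yields the partial complex products. */
static void split_x_sketch(const FLOAT x[8], FLOAT re[8], FLOAT im[8])
{
    int k;
    for (k = 0; k < 8; k += 2) {
        re[k] = re[k + 1] = x[k];        /* vblendps $0x55 with the swapped copy */
        im[k] = im[k + 1] = x[k + 1];
    }
}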

kernel/x86_64/daxpy.c Normal file

@ -0,0 +1,105 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER)
#include "daxpy_microk_bulldozer-2.c"
#endif
#ifndef HAVE_KERNEL_8
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
FLOAT a = *alpha;
while(i < n)
{
y[i] += a * x[i];
y[i+1] += a * x[i+1];
y[i+2] += a * x[i+2];
y[i+3] += a * x[i+3];
y[i+4] += a * x[i+4];
y[i+5] += a * x[i+5];
y[i+6] += a * x[i+6];
y[i+7] += a * x[i+7];
i+=8 ;
}
}
#endif
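/* Minimal usage sketch (hypothetical test harness, not part of this file):
   the block kernel expects n to be a positive multiple of 8; the interface
   routine below strips the remainder and the strided cases before calling it. */
static void daxpy_kernel_8_example(void)
{
    FLOAT x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    FLOAT y[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    FLOAT alpha = 0.5;
    daxpy_kernel_8(8, x, y, &alpha);   /* y[i] += 0.5 * x[i], so y[7] == 4.0 */
}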
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0);
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -8;
if ( n1 )
daxpy_kernel_8(n1, x, y , &da );
i = n1;
while(i < n)
{
y[i] += da * x[i] ;
i++ ;
}
return(0);
}
while(i < n)
{
y[iy] += da * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
return(0);
}


@ -0,0 +1,82 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vmovddup (%4), %%xmm0 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 768(%3,%0,8) \n\t"
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
".align 2 \n\t"
"vmovups %%xmm8 , (%3,%0,8) \n\t"
"vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x
".align 2 \n\t"
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
"vmovups %%xmm9 , 16(%3,%0,8) \n\t"
"prefetcht0 768(%2,%0,8) \n\t"
".align 2 \n\t"
"vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x
"vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
"vmovups %%xmm10, 32(%3,%0,8) \n\t"
"vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x
"vmovups %%xmm11, 48(%3,%0,8) \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
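
For readers unfamiliar with FMA4: in the AT&T listing above each vfmaddpd takes two y elements straight from memory as the addend and multiplies the broadcast alpha (xmm0) with two x elements, so, as the inline comments note, one instruction performs y + alpha*x per 128-bit lane. A scalar sketch of one lane, purely for illustration (the helper name is an assumption, not part of the patch):

/* Illustration only: the per-lane effect of
 * "vfmaddpd (y), %%xmm0, %%xmm12, %%xmm8", i.e. result = x*alpha + y. */
static inline void fma4_lane(double out[2], const double x2[2],
                             double alpha, const double y2[2])
{
    out[0] = x2[0] * alpha + y2[0];
    out[1] = x2[1] * alpha + y2[1];
}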


@ -0,0 +1,91 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%4), %%xmm0 \n\t" // alpha
"shufpd $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 192(%2,%0,8) \n\t"
// "prefetcht0 192(%3,%0,8) \n\t"
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
"movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
"movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
"movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
"movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
"mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x
"mulpd %%xmm0 , %%xmm13 \n\t"
"mulpd %%xmm0 , %%xmm14 \n\t"
"mulpd %%xmm0 , %%xmm15 \n\t"
"addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x
"addpd %%xmm13, %%xmm9 \n\t"
"addpd %%xmm14, %%xmm10 \n\t"
"addpd %%xmm15, %%xmm11 \n\t"
"movups %%xmm8 , (%3,%0,8) \n\t"
"movups %%xmm9 , 16(%3,%0,8) \n\t"
"movups %%xmm10, 32(%3,%0,8) \n\t"
"movups %%xmm11, 48(%3,%0,8) \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/ddot.c Normal file

@ -0,0 +1,110 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_8
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
while(i < n)
{
dot += y[i] * x[i]
+ y[i+1] * x[i+1]
+ y[i+2] * x[i+2]
+ y[i+3] * x[i+3]
+ y[i+4] * x[i+4]
+ y[i+5] * x[i+5]
+ y[i+6] * x[i+6]
+ y[i+7] * x[i+7] ;
i+=8 ;
}
*d += dot;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT dot = 0.0 ;
if ( n <= 0 ) return(dot);
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -8;
if ( n1 )
ddot_kernel_8(n1, x, y , &dot );
i = n1;
while(i < n)
{
dot += y[i] * x[i] ;
i++ ;
}
return(dot);
}
while(i < n)
{
dot += y[iy] * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
return(dot);
}
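
The micro-kernels selected above keep four independent partial sums (xmm4..xmm7) and reduce them once after the loop, while the plain C fallback uses a single accumulator. A standalone sketch of the four-accumulator idea (illustrative only; dot_ref is not the code the build actually uses):

#include <stddef.h>

/* Four partial sums shorten the add dependency chain, mirroring the four
 * accumulator registers in the SSE/FMA4 kernels; they are combined only
 * after the loop, then the scalar tail is added. */
static double dot_ref(size_t n, const double *x, const double *y)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    size_t n1 = n & ~(size_t)3;
    size_t i = 0;
    for (; i < n1; i += 4) {
        s0 += x[i]     * y[i];
        s1 += x[i + 1] * y[i + 1];
        s2 += x[i + 2] * y[i + 2];
        s3 += x[i + 3] * y[i + 3];
    }
    for (; i < n; i++)
        s0 += x[i] * y[i];
    return (s0 + s1) + (s2 + s3);
}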


@ -25,47 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
- #define HAVE_KERNEL_16x4 1
+ #define HAVE_KERNEL_8 1
- static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
- static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
- "vzeroupper \n\t"
+ "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
+ "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
+ "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
+ "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
".align 16 \n\t"
".L01LOOP%=: \n\t"
- "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y
+ "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
- "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y
+ "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
+ "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
+ "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
- "prefetcht0 192(%4,%0,4) \n\t"
+ "vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y
- "prefetcht0 192(%5,%0,4) \n\t"
+ "vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
- "prefetcht0 192(%7,%0,4) \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
- "vmovups %%ymm4 , (%3,%0,4) \n\t" // 8 * y
+ "addq $8 , %0 \n\t"
- "vmovups %%ymm5 , 32(%3,%0,4) \n\t" // 8 * y
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
- "addq $16, %0 \n\t"
+ "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
- "subq $16, %1 \n\t"
+ "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
- "jnz .L01LOOP%= \n\t"
+ "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
- "vzeroupper \n\t"
+ "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"
+ "vmovsd %%xmm4, (%4) \n\t"
:
:
@ -73,12 +71,10 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
- "r" (ap[0]), // 4
+ "r" (dot) // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);


@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
"xorpd %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
"movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
"movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
"movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
"movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
"movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
"movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
"mulpd %%xmm8 , %%xmm12 \n\t"
"mulpd %%xmm9 , %%xmm13 \n\t"
"mulpd %%xmm10, %%xmm14 \n\t"
"mulpd %%xmm11, %%xmm15 \n\t"
"addpd %%xmm12, %%xmm4 \n\t"
"addpd %%xmm13, %%xmm5 \n\t"
"addpd %%xmm14, %%xmm6 \n\t"
"addpd %%xmm15, %%xmm7 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"addpd %%xmm5, %%xmm4 \n\t"
"addpd %%xmm7, %%xmm6 \n\t"
"addpd %%xmm6, %%xmm4 \n\t"
"haddpd %%xmm4, %%xmm4 \n\t"
"movsd %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@ -1,206 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "dgemv_n_microk_haswell-2.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_16x4
static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
#endif
static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0];
y[i+1] += a0[i+1]*x[0];
y[i+2] += a0[i+2]*x[0];
y[i+3] += a0[i+3]*x[0];
}
}
static void zero_y(BLASLONG n, FLOAT *dest)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = 0.0;
dest++;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest == 1 )
{
for ( i=0; i<n; i+=4 )
{
dest[i] += src[i];
dest[i+1] += src[i+1];
dest[i+2] += src[i+2];
dest[i+3] += src[i+3];
}
}
else
{
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG n2;
FLOAT xbuffer[4],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
n1 = n / 4 ;
n2 = n % 4 ;
m1 = m - ( m % 16 );
m2 = (m % NBMAX) - (m % 16) ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
zero_y(NB,ybuffer);
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = alpha * x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = alpha * x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = alpha * x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = alpha * x_ptr[0];
x_ptr += inc_x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
dgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
a_ptr += 4 * lda;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = alpha * x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
}
add_y(NB,ybuffer,y_ptr,inc_y);
a += NB;
y_ptr += NB * inc_y;
}
j=0;
while ( j < (m % 16))
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
y_ptr[0] += alpha * temp;
y_ptr += inc_y;
a++;
j++;
}
return(0);
}

kernel/x86_64/dgemv_n_4.c Normal file

@ -0,0 +1,548 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL)
#include "dgemv_n_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x8
static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT *b0,*b1,*b2,*b3;
FLOAT *x4;
FLOAT x[8];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x4 = x + 4;
for ( i=0; i<8; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3];
y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x4
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x2
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2) , %%xmm12 \n\t" // x0
"movsd (%6) , %%xmm4 \n\t" // alpha
"movsd 8(%2) , %%xmm13 \n\t" // x1
"mulsd %%xmm4 , %%xmm12 \n\t" // alpha
"mulsd %%xmm4 , %%xmm13 \n\t" // alpha
"shufpd $0, %%xmm12, %%xmm12 \n\t"
"shufpd $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm5 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
"movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
#ifndef HAVE_KERNEL_4x2
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"mulsd (%5), %%xmm12 \n\t" // alpha
"shufpd $0, %%xmm12, %%xmm12 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm12, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
"movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (alpha) // 5
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*8);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
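
Stripped of the blocking (NBMAX row panels, the 8/4/2/1-column kernels, the xbuffer/ybuffer copies for non-unit strides, and the m%4 tail cases), the routine above computes y := y + alpha*A*x for column-major A. A compact reference sketch, illustrative only (gemv_n_ref is an assumed name, not part of the patch):

#include <stddef.h>

/* Column-at-a-time reference for the non-transposed GEMV: alpha is folded
 * into the x element once per column, as the unrolled kernels above do. */
static void gemv_n_ref(size_t m, size_t n, double alpha,
                       const double *a, size_t lda,
                       const double *x, double *y)
{
    for (size_t j = 0; j < n; j++) {
        const double *col = a + j * lda;   /* column j of A */
        double xj = alpha * x[j];
        for (size_t i = 0; i < m; i++)
            y[i] += col[i] * xj;           /* y += alpha * x[j] * A(:,j) */
    }
}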


@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $8 , %8 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L8END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L8END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
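
The two Haswell kernels above rely on the FMA3 "231" form: vfmadd231pd multiplies its two source operands and adds the result to the destination register, so ymm4/ymm5 accumulate the per-column products and alpha is applied only once before each store. A one-element scalar picture, purely illustrative (the helper name is an assumption):

/* Scalar picture of one vfmadd231pd element: the destination doubles as the
 * accumulator, so no separate add instruction is needed. */
static inline double fmadd231(double acc, double a_elem, double x_bcast)
{
    return a_elem * x_bcast + acc;
}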


@ -0,0 +1,265 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"movsd 8(%2), %%xmm13 \n\t" // x1
"movsd 16(%2), %%xmm14 \n\t" // x2
"movsd 24(%2), %%xmm15 \n\t" // x3
"shufpd $0, %%xmm12, %%xmm12\n\t"
"shufpd $0, %%xmm13, %%xmm13\n\t"
"shufpd $0, %%xmm14, %%xmm14\n\t"
"shufpd $0, %%xmm15, %%xmm15\n\t"
"movsd 32(%2), %%xmm0 \n\t" // x4
"movsd 40(%2), %%xmm1 \n\t" // x5
"movsd 48(%2), %%xmm2 \n\t" // x6
"movsd 56(%2), %%xmm3 \n\t" // x7
"shufpd $0, %%xmm0 , %%xmm0 \n\t"
"shufpd $0, %%xmm1 , %%xmm1 \n\t"
"shufpd $0, %%xmm2 , %%xmm2 \n\t"
"shufpd $0, %%xmm3 , %%xmm3 \n\t"
"movsd (%9), %%xmm6 \n\t" // alpha
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
".align 2 \n\t"
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"movups (%6,%0,8), %%xmm10 \n\t"
"movups (%7,%0,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,8), %%xmm8 \n\t"
"movups (%5,%8,8), %%xmm9 \n\t"
"movups (%6,%8,8), %%xmm10 \n\t"
"movups (%7,%8,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm0 , %%xmm8 \n\t"
"mulpd %%xmm1 , %%xmm9 \n\t"
"mulpd %%xmm2 , %%xmm10 \n\t"
"mulpd %%xmm3 , %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"addpd %%xmm5 , %%xmm4 \n\t"
"mulpd %%xmm6 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
".align 2 \n\t"
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"movups 16(%6,%0,8), %%xmm10 \n\t"
"movups 16(%7,%0,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"movups 16(%4,%8,8), %%xmm8 \n\t"
"movups 16(%5,%8,8), %%xmm9 \n\t"
"movups 16(%6,%8,8), %%xmm10 \n\t"
"movups 16(%7,%8,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm0 , %%xmm8 \n\t"
"mulpd %%xmm1 , %%xmm9 \n\t"
"mulpd %%xmm2 , %%xmm10 \n\t"
"mulpd %%xmm3 , %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addpd %%xmm5 , %%xmm4 \n\t"
"mulpd %%xmm6 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"movsd 8(%2), %%xmm13 \n\t" // x1
"movsd 16(%2), %%xmm14 \n\t" // x2
"movsd 24(%2), %%xmm15 \n\t" // x3
"shufpd $0, %%xmm12, %%xmm12\n\t"
"shufpd $0, %%xmm13, %%xmm13\n\t"
"shufpd $0, %%xmm14, %%xmm14\n\t"
"shufpd $0, %%xmm15, %%xmm15\n\t"
"movsd (%8), %%xmm6 \n\t" // alpha
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"movups (%6,%0,8), %%xmm10 \n\t"
"movups (%7,%0,8), %%xmm11 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"addpd %%xmm10 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t"
"addpd %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, (%3,%0,8) \n\t" // 2 * y
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"movups 16(%6,%0,8), %%xmm10 \n\t"
"movups 16(%7,%0,8), %%xmm11 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"addpd %%xmm10 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t"
"addpd %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@ -1,191 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "dgemv_t_microk_haswell-2.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_16x4
static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
}
#endif
static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0;
a0 = ap;
FLOAT temp = 0.0;
for ( i=0; i< n; i+=4 )
{
temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
}
*y = temp;
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
n1 = n / 4 ;
n2 = n % 4 ;
m1 = m - ( m % 16 );
m2 = (m % NBMAX) - (m % 16) ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
copy_x(NB,x_ptr,xbuffer,inc_x);
for( i = 0; i < n1 ; i++)
{
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
dgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
a_ptr += 4 * lda;
*y_ptr += ybuffer[0]*alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1]*alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[2]*alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[3]*alpha;
y_ptr += inc_y;
}
for( i = 0; i < n2 ; i++)
{
dgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
*y_ptr += ybuffer[0]*alpha;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
BLASLONG m3 = m % 16;
if ( m3 == 0 ) return(0);
x_ptr = x;
for ( i=0; i< m3; i++ )
{
xbuffer[i] = *x_ptr;
x_ptr += inc_x;
}
j=0;
a_ptr = a;
y_ptr = y;
while ( j < n)
{
FLOAT temp = 0.0;
for( i = 0; i < m3; i++ )
{
temp += a_ptr[i] * xbuffer[i];
}
a_ptr += lda;
y_ptr[0] += alpha * temp;
y_ptr += inc_y;
j++;
}
return(0);
}

kernel/x86_64/dgemv_t_4.c Normal file

@ -0,0 +1,615 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "dgemv_t_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x4
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
}
#endif
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorpd %%xmm10 , %%xmm10 \n\t"
"xorpd %%xmm11 , %%xmm11 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
"movups 16(%5,%0,8) , %%xmm14 \n\t" // x
"movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0
"movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"haddpd %%xmm11, %%xmm11 \n\t"
"movsd %%xmm10, (%2) \n\t"
"movsd %%xmm11,8(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
"r" (x) // 5
: "cc",
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorpd %%xmm9 , %%xmm9 \n\t"
"xorpd %%xmm10 , %%xmm10 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"mulpd %%xmm11 , %%xmm12 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups 16(%3,%0,8) , %%xmm14 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"movups 16(%4,%0,8) , %%xmm13 \n\t"
"mulpd %%xmm11 , %%xmm12 \n\t"
"mulpd %%xmm13 , %%xmm14 \n\t"
"addq $4 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
"addpd %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"addpd %%xmm9 , %%xmm10 \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"movsd %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
: "cc",
"%xmm9", "%xmm10" ,
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
"memory"
);
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
"movsd (%2) , %%xmm10 \n\t"
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"mulpd %%xmm10 , %%xmm12 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm11 \n\t"
"subq $2 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + NBMAX;
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}


@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#define HAVE_KERNEL_16x4 1 #define HAVE_KERNEL_4x4 1
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
@ -41,29 +41,49 @@ static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t" ".L01LOOP%=: \n\t"
"prefetcht0 384(%2,%0,8) \n\t" // "prefetcht0 384(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
"prefetcht0 384(%4,%0,8) \n\t" // "prefetcht0 384(%4,%0,8) \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
"prefetcht0 384(%5,%0,8) \n\t" // "prefetcht0 384(%5,%0,8) \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"prefetcht0 384(%6,%0,8) \n\t"
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
"prefetcht0 384(%7,%0,8) \n\t" // "prefetcht0 384(%6,%0,8) \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t" "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t" "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"addq $8 , %0 \n\t"
// "prefetcht0 384(%7,%0,8) \n\t"
"vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
"subq $8 , %1 \n\t"
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t" "jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t" "vextractf128 $1 , %%ymm6, %%xmm14 \n\t"

kernel/x86_64/dsymv_L.c (new file, 299 lines)

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_L_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4
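// Generic C version: processes four columns of the lower triangle at a time, adding tmp1[k]*ap[k][i] to y and accumulating the dot products of ap[k] with x into temp2.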
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
{
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
BLASLONG i;
for (i=from; i<to; i+=4)
{
y[i] += tmp1[0] * ap[0][i];
tmp2[0] += ap[0][i] * x[i];
y[i] += tmp1[1] * ap[1][i];
tmp2[1] += ap[1][i] * x[i];
y[i] += tmp1[2] * ap[2][i];
tmp2[2] += ap[2][i] * x[i];
y[i] += tmp1[3] * ap[3][i];
tmp2[3] += ap[3][i] * x[i];
y[i+1] += tmp1[0] * ap[0][i+1];
tmp2[0] += ap[0][i+1] * x[i+1];
y[i+1] += tmp1[1] * ap[1][i+1];
tmp2[1] += ap[1][i+1] * x[i+1];
y[i+1] += tmp1[2] * ap[2][i+1];
tmp2[2] += ap[2][i+1] * x[i+1];
y[i+1] += tmp1[3] * ap[3][i+1];
tmp2[3] += ap[3][i+1] * x[i+1];
y[i+2] += tmp1[0] * ap[0][i+2];
tmp2[0] += ap[0][i+2] * x[i+2];
y[i+2] += tmp1[1] * ap[1][i+2];
tmp2[1] += ap[1][i+2] * x[i+2];
y[i+2] += tmp1[2] * ap[2][i+2];
tmp2[2] += ap[2][i+2] * x[i+2];
y[i+2] += tmp1[3] * ap[3][i+2];
tmp2[3] += ap[3][i+2] * x[i+2];
y[i+3] += tmp1[0] * ap[0][i+3];
tmp2[0] += ap[0][i+3] * x[i+3];
y[i+3] += tmp1[1] * ap[1][i+3];
tmp2[1] += ap[1][i+3] * x[i+3];
y[i+3] += tmp1[2] * ap[2][i+3];
tmp2[2] += ap[2][i+3] * x[i+3];
y[i+3] += tmp1[3] * ap[3][i+3];
tmp2[3] += ap[3][i+3] * x[i+3];
}
temp2[0] += tmp2[0];
temp2[1] += tmp2[1];
temp2[2] += tmp2[2];
temp2[3] += tmp2[3];
}
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG jx,jy;
BLASLONG j;
FLOAT temp1;
FLOAT temp2;
FLOAT tmp1[4];
FLOAT tmp2[4];
FLOAT *ap[4];
#if 0
if ( m != offset )
printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif
if ( (inc_x != 1) || (inc_y != 1) )
{
jx = 0;
jy = 0;
for (j=0; j<offset; j++)
{
temp1 = alpha * x[jx];
temp2 = 0.0;
y[jy] += temp1 * a[j*lda+j];
iy = jy;
ix = jx;
for (i=j+1; i<m; i++)
{
ix += inc_x;
iy += inc_y;
y[iy] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[ix];
}
y[jy] += alpha * temp2;
jx += inc_x;
jy += inc_y;
}
return(0);
}
BLASLONG offset1 = (offset/4)*4;
for (j=0; j<offset1; j+=4)
{
tmp1[0] = alpha * x[j];
tmp1[1] = alpha * x[j+1];
tmp1[2] = alpha * x[j+2];
tmp1[3] = alpha * x[j+3];
tmp2[0] = 0.0;
tmp2[1] = 0.0;
tmp2[2] = 0.0;
tmp2[3] = 0.0;
ap[0] = &a[j*lda];
ap[1] = ap[0] + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
y[j] += tmp1[0] * ap[0][j];
y[j+1] += tmp1[1] * ap[1][j+1];
y[j+2] += tmp1[2] * ap[2][j+2];
y[j+3] += tmp1[3] * ap[3][j+3];
BLASLONG from = j+1;
if ( m - from >=12 )
{
BLASLONG m2 = (m/4)*4;
for (i=j+1; i<j+4; i++)
{
y[i] += tmp1[0] * ap[0][i];
tmp2[0] += ap[0][i] * x[i];
}
for (i=j+2; i<j+4; i++)
{
y[i] += tmp1[1] * ap[1][i];
tmp2[1] += ap[1][i] * x[i];
}
for (i=j+3; i<j+4; i++)
{
y[i] += tmp1[2] * ap[2][i];
tmp2[2] += ap[2][i] * x[i];
}
if ( m2 > j+4 )
dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2);
for (i=m2; i<m; i++)
{
y[i] += tmp1[0] * ap[0][i];
tmp2[0] += ap[0][i] * x[i];
y[i] += tmp1[1] * ap[1][i];
tmp2[1] += ap[1][i] * x[i];
y[i] += tmp1[2] * ap[2][i];
tmp2[2] += ap[2][i] * x[i];
y[i] += tmp1[3] * ap[3][i];
tmp2[3] += ap[3][i] * x[i];
}
}
else
{
for (i=j+1; i<j+4; i++)
{
y[i] += tmp1[0] * ap[0][i];
tmp2[0] += ap[0][i] * x[i];
}
for (i=j+2; i<j+4; i++)
{
y[i] += tmp1[1] * ap[1][i];
tmp2[1] += ap[1][i] * x[i];
}
for (i=j+3; i<j+4; i++)
{
y[i] += tmp1[2] * ap[2][i];
tmp2[2] += ap[2][i] * x[i];
}
for (i=j+4; i<m; i++)
{
y[i] += tmp1[0] * ap[0][i];
tmp2[0] += ap[0][i] * x[i];
y[i] += tmp1[1] * ap[1][i];
tmp2[1] += ap[1][i] * x[i];
y[i] += tmp1[2] * ap[2][i];
tmp2[2] += ap[2][i] * x[i];
y[i] += tmp1[3] * ap[3][i];
tmp2[3] += ap[3][i] * x[i];
}
}
y[j] += alpha * tmp2[0];
y[j+1] += alpha * tmp2[1];
y[j+2] += alpha * tmp2[2];
y[j+3] += alpha * tmp2[3];
}
for (j=offset1; j<offset; j++)
{
temp1 = alpha * x[j];
temp2 = 0.0;
y[j] += temp1 * a[j*lda+j];
BLASLONG from = j+1;
if ( m - from >=8 )
{
BLASLONG j1 = ((from + 4)/4)*4;
BLASLONG j2 = (m/4)*4;
for (i=from; i<j1; i++)
{
y[i] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[i];
}
for (i=j1; i<j2; i++)
{
y[i] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[i];
}
for (i=j2; i<m; i++)
{
y[i] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[i];
}
}
else
{
for (i=from; i<m; i++)
{
y[i] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[i];
}
}
y[j] += alpha * temp2;
}
return(0);
}


@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
__asm__ __volatile__
(
"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
"vmovddup (%8), %%xmm4 \n\t" // temp1[0]
"vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
"vmovddup 16(%8), %%xmm6 \n\t" // temp1[2]
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[3]
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
"vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
"vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
"vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
"vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
"vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
"vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
"vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
"vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
"addq $4 , %0 \n\t"
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"vmovsd (%9), %%xmm4 \n\t"
"vmovsd 8(%9), %%xmm5 \n\t"
"vmovsd 16(%9), %%xmm6 \n\t"
"vmovsd 24(%9), %%xmm7 \n\t"
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a[0]), // 4
"r" (a[1]), // 5
"r" (a[2]), // 6
"r" (a[3]), // 7
"r" (temp1), // 8
"r" (temp2) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@ -0,0 +1,132 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
__asm__ __volatile__
(
"xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
"xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
"xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
"xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
"movsd (%8), %%xmm4 \n\t" // temp1[0]
"movsd 8(%8), %%xmm5 \n\t" // temp1[1]
"movsd 16(%8), %%xmm6 \n\t" // temp1[2]
"movsd 24(%8), %%xmm7 \n\t" // temp1[3]
"shufpd $0, %%xmm4, %%xmm4 \n\t"
"shufpd $0, %%xmm5, %%xmm5 \n\t"
"shufpd $0, %%xmm6, %%xmm6 \n\t"
"shufpd $0, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
"movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
"movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
"mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm12 \n\t" // a * x
"addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
"movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
"movups %%xmm13 , %%xmm11 \n\t"
"mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm13 \n\t" // a * x
"addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"movups %%xmm14 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm14 \n\t" // a * x
"addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"addq $2 , %0 \n\t"
"movups %%xmm15 , %%xmm11 \n\t"
"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm15 \n\t" // a * x
"addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"movsd (%9), %%xmm4 \n\t" // temp1[0]
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]
"movsd 16(%9), %%xmm6 \n\t" // temp1[2]
"movsd 24(%9), %%xmm7 \n\t" // temp1[3]
"haddpd %%xmm0, %%xmm0 \n\t"
"haddpd %%xmm1, %%xmm1 \n\t"
"haddpd %%xmm2, %%xmm2 \n\t"
"haddpd %%xmm3, %%xmm3 \n\t"
"addsd %%xmm4, %%xmm0 \n\t"
"addsd %%xmm5, %%xmm1 \n\t"
"addsd %%xmm6, %%xmm2 \n\t"
"addsd %%xmm7, %%xmm3 \n\t"
"movsd %%xmm0 , (%9) \n\t" // save temp2
"movsd %%xmm1 , 8(%9) \n\t" // save temp2
"movsd %%xmm2 , 16(%9) \n\t" // save temp2
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
:
"r" (from), // 0
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a[0]), // 4
"r" (a[1]), // 5
"r" (a[2]), // 6
"r" (a[3]), // 7
"r" (temp1), // 8
"r" (temp2) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/dsymv_U.c (new file, 273 lines)

@ -0,0 +1,273 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "dsymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_U_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
FLOAT at0,at1,at2,at3;
FLOAT x;
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
FLOAT tp0;
FLOAT tp1;
FLOAT tp2;
FLOAT tp3;
BLASLONG i;
tp0 = temp1[0];
tp1 = temp1[1];
tp2 = temp1[2];
tp3 = temp1[3];
for (i=0; i<n; i++)
{
at0 = a0[i];
at1 = a1[i];
at2 = a2[i];
at3 = a3[i];
x = xp[i];
yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
tmp2[0] += at0 * x;
tmp2[1] += at1 * x;
tmp2[2] += at2 * x;
tmp2[3] += at3 * x;
}
temp2[0] += tmp2[0];
temp2[1] += tmp2[1];
temp2[2] += tmp2[2];
temp2[3] += tmp2[3];
}
#endif
#ifndef HAVE_KERNEL_1x4
static void dsymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
FLOAT at0,at1,at2,at3;
FLOAT x;
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
FLOAT tp0;
FLOAT tp1;
FLOAT tp2;
FLOAT tp3;
BLASLONG i;
tp0 = temp1[0];
tp1 = temp1[1];
tp2 = temp1[2];
tp3 = temp1[3];
for (i=from; i<to; i++)
{
at0 = a0[i];
at1 = a1[i];
at2 = a2[i];
at3 = a3[i];
x = xp[i];
yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
tmp2[0] += at0 * x;
tmp2[1] += at1 * x;
tmp2[2] += at2 * x;
tmp2[3] += at3 * x;
}
temp2[0] += tmp2[0];
temp2[1] += tmp2[1];
temp2[2] += tmp2[2];
temp2[3] += tmp2[3];
}
#endif
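// Single-column helper for the leftover columns: y[0:n] += *temp1 * a0 and *temp2 = dot(a0, x); the caller passes n = (j/8)*8, a multiple of 4.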
static void dsymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
FLOAT at0,at1,at2,at3;
FLOAT temp = 0.0;
FLOAT t1 = *temp1;
BLASLONG i;
for (i=0; i<(n/4)*4; i+=4)
{
at0 = a0[i];
at1 = a0[i+1];
at2 = a0[i+2];
at3 = a0[i+3];
yp[i] += t1 * at0;
temp += at0 * xp[i];
yp[i+1] += t1 * at1;
temp += at1 * xp[i+1];
yp[i+2] += t1 * at2;
temp += at2 * xp[i+2];
yp[i+3] += t1 * at3;
temp += at3 * xp[i+3];
}
*temp2 = temp;
}
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG jx,jy;
BLASLONG j;
BLASLONG j1;
BLASLONG j2;
BLASLONG m2;
FLOAT temp1;
FLOAT temp2;
FLOAT *xp, *yp;
FLOAT *a0,*a1,*a2,*a3;
FLOAT at0,at1,at2,at3;
FLOAT tmp1[4];
FLOAT tmp2[4];
#if 0
if( m != offset )
printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif
BLASLONG m1 = m - offset;
BLASLONG mrange = m -m1;
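// Only the trailing mrange = offset rows and columns are referenced; use the scalar loop for non-unit strides or very small ranges.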
if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) )
{
jx = m1 * inc_x;
jy = m1 * inc_y;
for (j=m1; j<m; j++)
{
temp1 = alpha * x[jx];
temp2 = 0.0;
iy = 0;
ix = 0;
for (i=0; i<j; i++)
{
y[iy] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * x[ix];
ix += inc_x;
iy += inc_y;
}
y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
jx += inc_x;
jy += inc_y;
}
return(0);
}
xp = x;
yp = y;
m2 = m - ( mrange % 4 );
for (j=m1; j<m2; j+=4)
{
tmp1[0] = alpha * xp[j];
tmp1[1] = alpha * xp[j+1];
tmp1[2] = alpha * xp[j+2];
tmp1[3] = alpha * xp[j+3];
tmp2[0] = 0.0;
tmp2[1] = 0.0;
tmp2[2] = 0.0;
tmp2[3] = 0.0;
a0 = &a[j*lda];
a1 = a0+lda;
a2 = a1+lda;
a3 = a2+lda;
j1 = (j/8)*8;
if ( j1 )
dsymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
if ( j1 < j )
dsymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
j2 = 0;
for ( j1 = j ; j1 < j+4 ; j1++ )
{
temp1 = tmp1[j2];
temp2 = tmp2[j2];
a0 = &a[j1*lda];
for ( i=j ; i<j1; i++ )
{
yp[i] += temp1 * a0[i];
temp2 += a0[i] * xp[i];
}
y[j1] += temp1 * a0[j1] + alpha * temp2;
j2++;
}
}
for ( ; j<m; j++)
{
temp1 = alpha * xp[j];
temp2 = 0.0;
a0 = &a[j*lda];
FLOAT at0;
j1 = (j/8)*8;
if ( j1 )
dsymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2);
for (i=j1 ; i<j; i++)
{
at0 = a0[i];
yp[i] += temp1 * at0;
temp2 += at0 * xp[i];
}
yp[j] += temp1 * a0[j] + alpha * temp2;
}
return(0);
}


@ -0,0 +1,130 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
"vmovddup (%8), %%xmm4 \n\t" // temp1[0]
"vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
"vmovddup 16(%8), %%xmm6 \n\t" // temp1[2]
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[3]
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
"vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
"vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
"vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
"vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
"vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
"vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
"vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
"vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"addq $4 , %0 \n\t"
"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
"subq $4 , %1 \n\t"
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
"r" (a1), // 5
"r" (a2), // 6
"r" (a3), // 7
"r" (temp1), // 8
"r" (temp2) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@ -0,0 +1,125 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
"xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
"xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
"xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
"movsd (%8), %%xmm4 \n\t" // temp1[0]
"movsd 8(%8), %%xmm5 \n\t" // temp1[1]
"movsd 16(%8), %%xmm6 \n\t" // temp1[2]
"movsd 24(%8), %%xmm7 \n\t" // temp1[3]
"shufpd $0, %%xmm4, %%xmm4 \n\t"
"shufpd $0, %%xmm5, %%xmm5 \n\t"
"shufpd $0, %%xmm6, %%xmm6 \n\t"
"shufpd $0, %%xmm7, %%xmm7 \n\t"
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
"movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
"movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
"mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm12 \n\t" // a * x
"addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
"movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
"movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
"movups %%xmm13 , %%xmm11 \n\t"
"mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm13 \n\t" // a * x
"addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
"movups %%xmm14 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm14 \n\t" // a * x
"addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
"addq $2 , %0 \n\t"
"movups %%xmm15 , %%xmm11 \n\t"
"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
"mulpd %%xmm8 , %%xmm15 \n\t" // a * x
"addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"subq $2 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"haddpd %%xmm0, %%xmm0 \n\t"
"haddpd %%xmm1, %%xmm1 \n\t"
"haddpd %%xmm2, %%xmm2 \n\t"
"haddpd %%xmm3, %%xmm3 \n\t"
"movsd %%xmm0 , (%9) \n\t" // save temp2
"movsd %%xmm1 , 8(%9) \n\t" // save temp2
"movsd %%xmm2 , 16(%9) \n\t" // save temp2
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
"r" (a1), // 5
"r" (a2), // 6
"r" (a3), // 7
"r" (temp1), // 8
"r" (temp2) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/saxpy.c (new file, 103 lines)

@ -0,0 +1,103 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_16
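// Plain C fallback, unrolled by 8; the caller only passes n as a multiple of 16, so the loop count is even.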
static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
FLOAT a = *alpha;
while(i < n)
{
y[i] += a * x[i];
y[i+1] += a * x[i+1];
y[i+2] += a * x[i+2];
y[i+3] += a * x[i+3];
y[i+4] += a * x[i+4];
y[i+5] += a * x[i+5];
y[i+6] += a * x[i+6];
y[i+7] += a * x[i+7];
i+=8 ;
}
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0);
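// Unit-stride fast path: hand the largest multiple of 16 to the kernel and finish the remainder in plain C.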
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -16;
if ( n1 )
saxpy_kernel_16(n1, x, y , &da );
i = n1;
while(i < n)
{
y[i] += da * x[i] ;
i++ ;
}
return(0);
}
while(i < n)
{
y[iy] += da * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
return(0);
}


@ -0,0 +1,91 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%4), %%xmm0 \n\t" // alpha
"shufps $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 192(%2,%0,4) \n\t"
// "prefetcht0 192(%3,%0,4) \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
"movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y
"movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y
"movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y
"mulps %%xmm0 , %%xmm12 \n\t" // alpha * x
"mulps %%xmm0 , %%xmm13 \n\t"
"mulps %%xmm0 , %%xmm14 \n\t"
"mulps %%xmm0 , %%xmm15 \n\t"
"addps %%xmm12, %%xmm8 \n\t" // y += alpha *x
"addps %%xmm13, %%xmm9 \n\t"
"addps %%xmm14, %%xmm10 \n\t"
"addps %%xmm15, %%xmm11 \n\t"
"movups %%xmm8 , (%3,%0,4) \n\t"
"movups %%xmm9 , 16(%3,%0,4) \n\t"
"movups %%xmm10, 32(%3,%0,4) \n\t"
"movups %%xmm11, 48(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/sdot.c (new file, 109 lines)

@ -0,0 +1,109 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sdot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_16
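// Plain C fallback: accumulates the dot product eight elements per iteration; the caller passes n as a multiple of 16.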
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
while(i < n)
{
dot += y[i] * x[i]
+ y[i+1] * x[i+1]
+ y[i+2] * x[i+2]
+ y[i+3] * x[i+3]
+ y[i+4] * x[i+4]
+ y[i+5] * x[i+5]
+ y[i+6] * x[i+6]
+ y[i+7] * x[i+7] ;
i+=8 ;
}
*d += dot;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT dot = 0.0 ;
if ( n <= 0 ) return(dot);
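// Unit-stride fast path: accumulate the bulk with the vector kernel, then finish the remaining elements in C.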
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -16;
if ( n1 )
sdot_kernel_16(n1, x, y , &dot );
i = n1;
while(i < n)
{
dot += y[i] * x[i] ;
i++ ;
}
return(dot);
}
while(i < n)
{
dot += y[iy] * x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
return(dot);
}


@ -25,48 +25,46 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#define HAVE_KERNEL_16x4 1 #define HAVE_KERNEL_16 1
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
__asm__ __volatile__ __asm__ __volatile__
( (
"vzeroupper \n\t" "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t" ".L01LOOP%=: \n\t"
"prefetcht0 192(%3,%0,8) \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmovups (%3,%0,8), %%ymm4 \n\t" // 4 * y "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vmovups 32(%3,%0,8), %%ymm5 \n\t" // 4 * y "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
"prefetcht0 192(%4,%0,8) \n\t" "vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" "vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y
"prefetcht0 192(%5,%0,8) \n\t" "vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,8) \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"prefetcht0 192(%7,%0,8) \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vmovups %%ymm4, (%3,%0,8) \n\t" // 4 * y "addq $16, %0 \n\t"
"vmovups %%ymm5, 32(%3,%0,8) \n\t" // 4 * y "subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"addq $8 , %0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
"subq $8 , %1 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
"jnz .L01LOOP%= \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
"vzeroupper \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vmovss %%xmm4, (%4) \n\t"
: :
: :
@ -74,12 +72,10 @@ static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (n), // 1 "r" (n), // 1
"r" (x), // 2 "r" (x), // 2
"r" (y), // 3 "r" (y), // 3
"r" (ap[0]), // 4 "r" (dot) // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc", : "cc",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"xorps %%xmm4, %%xmm4 \n\t"
"xorps %%xmm5, %%xmm5 \n\t"
"xorps %%xmm6, %%xmm6 \n\t"
"xorps %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * y
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y
"movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
"movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y
"movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
"movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y
"mulps %%xmm8 , %%xmm12 \n\t"
"mulps %%xmm9 , %%xmm13 \n\t"
"mulps %%xmm10, %%xmm14 \n\t"
"mulps %%xmm11, %%xmm15 \n\t"
"addps %%xmm12, %%xmm4 \n\t"
"addps %%xmm13, %%xmm5 \n\t"
"addps %%xmm14, %%xmm6 \n\t"
"addps %%xmm15, %%xmm7 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"addps %%xmm5, %%xmm4 \n\t"
"addps %%xmm7, %%xmm6 \n\t"
"addps %%xmm6, %%xmm4 \n\t"
"haddps %%xmm4, %%xmm4 \n\t"
"haddps %%xmm4, %%xmm4 \n\t"
"movss %%xmm4, (%4) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
 VFMADD231PS_( %ymm15,%ymm3,%ymm1 )
-addq $6*SIZE, BO
-addq $16*SIZE, AO
+addq $ 6*SIZE, BO
+addq $ 16*SIZE, AO
 decq %rax
 .endm
@@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
 VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
-addq $6*SIZE, BO
-addq $8*SIZE, AO
+addq $ 6*SIZE, BO
+addq $ 8*SIZE, AO
 decq %rax
 .endm
@@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
 VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
-addq $6*SIZE, BO
-addq $4*SIZE, AO
+addq $ 6*SIZE, BO
+addq $ 4*SIZE, AO
 decq %rax
 .endm
@@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
 VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
-addq $6*SIZE, BO
-addq $2*SIZE, AO
+addq $ 6*SIZE, BO
+addq $ 2*SIZE, AO
 decq %rax
 .endm
@@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
 VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
-addq $6*SIZE, BO
-addq $1*SIZE, AO
+addq $ 6*SIZE, BO
+addq $ 1*SIZE, AO
 decq %rax
 .endm


@@ -29,17 +29,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "sgemv_n_microk_bulldozer-2.c"
-#elif defined(HASWELL)
-#include "sgemv_n_microk_haswell-2.c"
-#elif defined(SANDYBRIDGE)
-#include "sgemv_n_microk_sandy-2.c"
-#elif defined(NEHALEM)
-#include "sgemv_n_microk_nehalem-2.c"
-#endif
 #define NBMAX 4096
 #ifndef HAVE_KERNEL_16x4

kernel/x86_64/sgemv_n_4.c (new file, 591 lines)

@ -0,0 +1,591 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_n_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell-4.c"
#endif
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x8
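// Generic C version: pre-scales eight x values by alpha and updates four y entries per iteration from eight columns (ap[0..3] and the same columns offset by lda4); n is a multiple of 4.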
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT *b0,*b1,*b2,*b3;
FLOAT *x4;
FLOAT x[8];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x4 = x + 4;
for ( i=0; i<8; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3];
y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x4
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x2
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2) , %%xmm12 \n\t" // x0
"movss (%6) , %%xmm4 \n\t" // alpha
"movss 4(%2) , %%xmm13 \n\t" // x1
"mulss %%xmm4 , %%xmm12 \n\t" // alpha
"mulss %%xmm4 , %%xmm13 \n\t" // alpha
"shufps $0, %%xmm12, %%xmm12 \n\t"
"shufps $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm9 , %%xmm4 \n\t"
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
#ifndef HAVE_KERNEL_4x2
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"mulss (%6), %%xmm12 \n\t" // alpha
"shufps $0, %%xmm12, %%xmm12 \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
"movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm12, %%xmm9 \n\t"
"addps %%xmm4 , %%xmm8 \n\t"
"addps %%xmm5 , %%xmm9 \n\t"
"addq $8 , %0 \n\t"
"movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y
"movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"testq $0x04, %5 \n\t"
"jz .L08LABEL%= \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
"mulps %%xmm12, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (n2), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
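// Flush the temporary y buffer back into y: scalar loop for non-unit inc_dest, SSE loop handling four floats per iteration otherwise.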
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%2,%0,4) , %%xmm12 \n\t"
"movups (%3,%0,4) , %%xmm11 \n\t"
"addps %%xmm12 , %%xmm11 \n\t"
"addq $4 , %0 \n\t"
"movups %%xmm11, -16(%3,%0,4) \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
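/*
 * Note (editorial): add_y() folds the contiguous ybuffer accumulator back into
 * the caller's y. For inc_dest != 1 it uses the scalar strided loop; for
 * inc_dest == 1 it uses the SSE loop, which processes four elements per
 * iteration and therefore assumes n is a multiple of 4 (the driver below
 * passes NB, which appears to satisfy this).
 */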
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
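/*
 * Usage sketch (editorial, hypothetical): this driver is not called directly;
 * it sits behind the sgemv interface for the non-transposed case and only
 * accumulates y += alpha*A*x, so beta is presumably applied by the interface
 * layer before dispatch. A minimal CBLAS-level call that would exercise this
 * path, assuming the standard prototype from <cblas.h>:
 */
#if 0
#include <cblas.h>

static void sgemv_n_example(void)
{
    float A[6] = { 1, 2, 3, 4, 5, 6 };   /* 3x2 matrix, column-major, lda = 3 */
    float x[2] = { 1, 1 };
    float y[3] = { 0, 0, 0 };

    /* y := 2.0f * A * x + 1.0f * y (NoTrans path) */
    cblas_sgemv(CblasColMajor, CblasNoTrans, 3, 2, 2.0f, A, 3, x, 1, 1.0f, y, 1);
}
#endif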

View File

@ -1,218 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell.c"
#else
#include "sgemv_n_microk_sandy.c"
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG register m2;
BLASLONG register n2;
FLOAT *xbuffer,*ybuffer;
xbuffer = buffer;
ybuffer = xbuffer + 2048 + 256;
n1 = n / 512 ;
n2 = n % 512 ;
m1 = m / 64;
m2 = m % 64;
y_ptr = y;
x_ptr = x;
for (j=0; j<n1; j++)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(512,x_ptr,xbuffer,inc_x);
a_ptr = a + j * 512 * lda;
y_ptr = y;
for(i = 0; i<m1; i++ )
{
sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(64,ybuffer,y_ptr,inc_y);
y_ptr += 64 * inc_y;
a_ptr += 64;
}
if ( m2 & 32 )
{
sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(32,ybuffer,y_ptr,inc_y);
y_ptr += 32 * inc_y;
a_ptr += 32;
}
if ( m2 & 16 )
{
sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(16,ybuffer,y_ptr,inc_y);
y_ptr += 16 * inc_y;
a_ptr += 16;
}
if ( m2 & 8 )
{
sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(8,ybuffer,y_ptr,inc_y);
y_ptr += 8 * inc_y;
a_ptr += 8;
}
if ( m2 & 4 )
{
sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(4,ybuffer,y_ptr,inc_y);
y_ptr += 4 * inc_y;
a_ptr += 4;
}
if ( m2 & 2 )
{
sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(2,ybuffer,y_ptr,inc_y);
y_ptr += 2 * inc_y;
a_ptr += 2;
}
if ( m2 & 1 )
{
sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(1,ybuffer,y_ptr,inc_y);
}
x_ptr += 512 * inc_x;
}
if ( n2 > 0 )
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(n2,x_ptr,xbuffer,inc_x);
a_ptr = a + n1 * 512 * lda;
y_ptr = y;
for(i = 0; i<m1; i++ )
{
sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(64,ybuffer,y_ptr,inc_y);
y_ptr += 64 * inc_y;
a_ptr += 64;
}
if ( m2 & 32 )
{
sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(32,ybuffer,y_ptr,inc_y);
y_ptr += 32 * inc_y;
a_ptr += 32;
}
if ( m2 & 16 )
{
sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(16,ybuffer,y_ptr,inc_y);
y_ptr += 16 * inc_y;
a_ptr += 16;
}
if ( m2 & 8 )
{
sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(8,ybuffer,y_ptr,inc_y);
y_ptr += 8 * inc_y;
a_ptr += 8;
}
if ( m2 & 4 )
{
sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(4,ybuffer,y_ptr,inc_y);
y_ptr += 4 * inc_y;
a_ptr += 4;
}
if ( m2 & 2 )
{
sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(2,ybuffer,y_ptr,inc_y);
y_ptr += 2 * inc_y;
a_ptr += 2;
}
if ( m2 & 1 )
{
sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(1,ybuffer,y_ptr,inc_y);
}
}
return(0);
}
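/*
 * Note (editorial): this removed driver used a coarser blocking scheme: n was
 * walked in panels of 512 columns (copying x into xbuffer when inc_x != 1) and
 * m in row tiles of 64/32/16/8/4/2/1, each tile being accumulated into ybuffer
 * and folded into y through add_y(). The new driver above appears to replace
 * it with 4-row register blocking and 8/4/2/1-column kernels that take alpha
 * directly.
 */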

View File

@ -1,99 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16x4 1
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"vmovups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y
"vmovups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm6, 32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7, 48(%3,%0,4) \n\t" // 4 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
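/*
 * Note (editorial): this removed Bulldozer microkernel handled 16 rows by 4
 * columns per iteration with FMA4 vfmaddps and accumulated straight into y;
 * it had no alpha parameter, so any scaling had to be done by its caller. The
 * microkernels added elsewhere in this commit work at 4-row granularity and
 * fold alpha in themselves.
 */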

View File

@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"addq $8 , %8 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
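/*
 * Note (editorial): vfmaddps is the 4-operand FMA4 form available on
 * Bulldozer/Piledriver. With the AT&T operand order used above,
 * "vfmaddps %xmmS, mem, %xmmB, %xmmD" computes D = B*mem + S elementwise, so
 * "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4" is effectively
 * xmm4 += a0[i..i+3] * x0. Alpha stays broadcast in xmm8 and is applied only
 * when the accumulators are combined with y.
 */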
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t"
"vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -1,451 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero
"vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero
"vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero
"vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp
"vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha
"vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha
"vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha
"vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha
"vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha
"vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8 , %8 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
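/*
 * Note (editorial): the Haswell kernels use the 3-operand FMA3 form; with the
 * AT&T operand order above, "vfmadd231ps src, %%ymmB, %%ymmD" computes
 * D += B*src elementwise. Alpha stays broadcast in ymm6 and is applied when
 * the accumulators are combined with y, and the vzeroupper at entry and exit
 * avoids AVX/SSE transition penalties around the 256-bit code.
 */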
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
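/*
 * Note (editorial): like the 4x8 kernel above, this 4x4 kernel peels a
 * 4-element tail (SSE width) and then an 8-element tail (AVX width) by testing
 * n & 4 and n & 8 before entering the 16-elements-per-iteration main loop, so
 * it only relies on n being a multiple of 4, which the driver guarantees.
 */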

View File

@ -1,461 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*2;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
"vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
"vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
"vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // address of a -> rsi
"movq %3, %%rcx\n\t" // value of lda -> rcx
"movq %4, %%rdi\n\t" // address of x -> rdi
"movq %5, %%rdx\n\t" // address of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
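/* Illustrative only: one way a caller can chain the fixed-width kernels above
   (8/4/2/1 rows per call) over an arbitrary row count m. The real driver that
   does this is not part of this hunk; the function-pointer type and the
   sgemv_n_blocks name are made up for the sketch. */
typedef void (*sgemv_block_t)( long n, float alpha, float *a, long lda, float *x, float *y);
static void sgemv_n_blocks( long m, long n, float alpha, float *a, long lda, float *x, float *y,
                            sgemv_block_t k8, sgemv_block_t k4, sgemv_block_t k2, sgemv_block_t k1)
{
	long i = 0;
	for ( ; i + 8 <= m; i += 8 )			// full 8-row panels first
		k8(n, alpha, a + i, lda, x, y + i);
	if ( m - i >= 4 ) { k4(n, alpha, a + i, lda, x, y + i); i += 4; }
	if ( m - i >= 2 ) { k2(n, alpha, a + i, lda, x, y + i); i += 2; }
	if ( m - i >= 1 ) { k1(n, alpha, a + i, lda, x, y + i); }
}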

View File

@ -1,144 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16x4 1
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"movups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y
"movups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y
"prefetcht0 192(%4,%0,4) \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups 16(%4,%0,4), %%xmm9 \n\t"
"movups 32(%4,%0,4), %%xmm10 \n\t"
"movups 48(%4,%0,4), %%xmm11 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"mulps %%xmm12, %%xmm9 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"mulps %%xmm12, %%xmm10 \n\t"
"addps %%xmm10, %%xmm6 \n\t"
"mulps %%xmm12, %%xmm11 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"movups (%5,%0,4), %%xmm8 \n\t"
"movups 16(%5,%0,4), %%xmm9 \n\t"
"movups 32(%5,%0,4), %%xmm10 \n\t"
"movups 48(%5,%0,4), %%xmm11 \n\t"
"mulps %%xmm13, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"mulps %%xmm13, %%xmm10 \n\t"
"addps %%xmm10, %%xmm6 \n\t"
"mulps %%xmm13, %%xmm11 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"movups (%6,%0,4), %%xmm8 \n\t"
"movups 16(%6,%0,4), %%xmm9 \n\t"
"movups 32(%6,%0,4), %%xmm10 \n\t"
"movups 48(%6,%0,4), %%xmm11 \n\t"
"mulps %%xmm14, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"mulps %%xmm14, %%xmm9 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"mulps %%xmm14, %%xmm10 \n\t"
"addps %%xmm10, %%xmm6 \n\t"
"mulps %%xmm14, %%xmm11 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"movups (%7,%0,4), %%xmm8 \n\t"
"movups 16(%7,%0,4), %%xmm9 \n\t"
"movups 32(%7,%0,4), %%xmm10 \n\t"
"movups 48(%7,%0,4), %%xmm11 \n\t"
"mulps %%xmm15, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"mulps %%xmm15, %%xmm9 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"mulps %%xmm15, %%xmm10 \n\t"
"addps %%xmm10, %%xmm6 \n\t"
"mulps %%xmm15, %%xmm11 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"movups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"movups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"movups %%xmm6, 32(%3,%0,4) \n\t" // 4 * y
"movups %%xmm7, 48(%3,%0,4) \n\t" // 4 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss 16(%2), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
"shufps $0, %%xmm3 , %%xmm3 \n\t"
"movss (%9), %%xmm6 \n\t" // alpha
"shufps $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
".align 2 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
".align 2 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"mulps %%xmm14, %%xmm10 \n\t"
"mulps %%xmm15, %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t"
".align 2 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
"mulps %%xmm2 , %%xmm10 \n\t"
"mulps %%xmm3 , %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
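/* What the 4x8 kernel above computes, written as plain C: eight columns are
   processed per call, the first four through ap[0..3] and the next four at
   element offset lda4 (= 4*lda) inside the same pointers; n is assumed to be
   a multiple of 4. The _ref name is illustrative only. */
static void sgemv_kernel_4x8_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG i, k;
	for ( i = 0; i < n; i++ )
	{
		FLOAT temp = 0.0;
		for ( k = 0; k < 4; k++ )
		{
			temp += ap[k][i]        * x[k];		// columns 0..3
			temp += ap[k][i + lda4] * x[k+4];	// columns 4..7
		}
		y[i] += (*alpha) * temp;
	}
}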
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss (%8), %%xmm6 \n\t" // alpha
"shufps $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"mulps %%xmm14, %%xmm10 \n\t"
"mulps %%xmm15, %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm9 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm10 , %%xmm4 \n\t"
"addps %%xmm4 , %%xmm11 \n\t"
"mulps %%xmm6 , %%xmm11 \n\t"
"addps %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
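/* The 4x4 variant above is the same non-transposed update restricted to four
   columns: y[i] += alpha * ( ap[0][i]*x[0] + ... + ap[3][i]*x[3] ) for
   i = 0..n-1, with n a multiple of 4. A scalar sketch (the _ref name is
   illustrative only): */
static void sgemv_kernel_4x4_n_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i, k;
	for ( i = 0; i < n; i++ )
	{
		FLOAT temp = 0.0;
		for ( k = 0; k < 4; k++ )
			temp += ap[k][i] * x[k];
		y[i] += (*alpha) * temp;
	}
}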

View File

@ -0,0 +1,370 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4, %8 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8, %8 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %8 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
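/* The AVX kernel above peels the remainder in front of the unrolled loop
   instead of after it: assuming n is a multiple of 4, it runs one 4-wide step
   if bit 2 of n is set, one 8-wide step if bit 3 is set, and then a main loop
   that always advances by 16 elements. A sketch of that split (the function
   name is illustrative only): */
static void split_n_4_8_16( BLASLONG n, BLASLONG *n4, BLASLONG *n8, BLASLONG *n16)
{
	/* n is assumed to be a multiple of 4 */
	*n4  = n & 4;			// 0 or 4
	*n8  = n & 8;			// 0 or 8
	*n16 = n - *n4 - *n8;		// what remains is a multiple of 16
}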
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t"
"vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t"
"vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
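/* Call-pattern sketch for the kernels in this file: a caller points ap[0..3]
   at four consecutive columns of a column-major matrix and keeps lda4 = 4*lda
   so the 4x8 kernel can reach columns 4..7 through the same pointers (the
   same pointer setup appears in the sgemv driver added later in this commit).
   The setup_ap name is illustrative only. */
static void setup_ap( FLOAT *a, BLASLONG lda, FLOAT *ap[4], BLASLONG *lda4)
{
	ap[0] = a;
	ap[1] = a + lda;
	ap[2] = ap[1] + lda;
	ap[3] = ap[2] + lda;
	*lda4 = 4 * lda;
}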

View File

@ -1,473 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*2;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp
"vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp
"vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp
"vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -28,16 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-2.c"
#elif defined(NEHALEM)
#include "sgemv_t_microk_nehalem-2.c"
#endif
#define NBMAX 4096 #define NBMAX 4096
#ifndef HAVE_KERNEL_16x4 #ifndef HAVE_KERNEL_16x4

kernel/x86_64/sgemv_t_4.c (new file, 624 lines)
View File

@ -0,0 +1,624 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell-4.c"
#endif
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
}
#endif
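/* The C fallback above is unrolled by four and therefore assumes n is a
   multiple of 4; the driver below always calls it that way (NB is either
   NBMAX or m2, both multiples of 4). Without the unrolling it is simply four
   dot products of consecutive columns with x (the _scalar name is
   illustrative only): */
static void sgemv_kernel_4x4_scalar(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i, k;
	for ( k = 0; k < 4; k++ )
	{
		FLOAT temp = 0.0;
		for ( i = 0; i < n; i++ )
			temp += ap[k][i] * x[i];
		y[k] = temp;
	}
}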
static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorps %%xmm10 , %%xmm10 \n\t"
"xorps %%xmm11 , %%xmm11 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
"movups 16(%5,%0,4) , %%xmm14 \n\t" // x
"movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0
"movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm11, %%xmm11 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm11, %%xmm11 \n\t"
"movss %%xmm10, (%2) \n\t"
"movss %%xmm11,4(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
"r" (x) // 5
: "cc",
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
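/* Scalar equivalent of the 4x2 kernel above: two dot products over the same
   x, results written (not accumulated) into y[0] and y[1]; n is a multiple
   of 4 in every call from the driver. The _ref name is illustrative only. */
static void sgemv_kernel_4x2_ref(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT temp0 = 0.0;
	FLOAT temp1 = 0.0;
	for ( i = 0; i < n; i++ )
	{
		temp0 += ap0[i] * x[i];
		temp1 += ap1[i] * x[i];
	}
	y[0] = temp0;
	y[1] = temp1;
}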
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorps %%xmm9 , %%xmm9 \n\t"
"xorps %%xmm10 , %%xmm10 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"mulps %%xmm11 , %%xmm12 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups 16(%3,%0,4) , %%xmm14 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"movups 16(%4,%0,4) , %%xmm13 \n\t"
"mulps %%xmm11 , %%xmm12 \n\t"
"mulps %%xmm13 , %%xmm14 \n\t"
"addq $8 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $8 , %1 \n\t"
"addps %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"addps %%xmm9 , %%xmm10 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"movss %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
: "cc",
"%xmm9", "%xmm10" ,
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
"memory"
);
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
"movss (%2) , %%xmm10 \n\t"
"shufps $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"mulps %%xmm10 , %%xmm12 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm11 \n\t"
"subq $4 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
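/* The assembly path of add_y above (taken when inc_dest == 1) is equivalent
   to this scalar loop; n is a multiple of 4 whenever it is reached. The _ref
   name is illustrative only. */
static void add_y_ref(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
	BLASLONG i;
	for ( i = 0; i < n; i++ )
		dest[i] += src[i] * da;
}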
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + NBMAX;
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
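/* End-to-end, the routine above computes the transposed sgemv update
   y := y + alpha * A^T * x for a column-major m x n matrix with leading
   dimension lda and arbitrary x/y strides; the blocked version simply
   processes at most NBMAX rows of x at a time through the bounce buffer.
   A minimal scalar reference of the same result (the _ref name is
   illustrative only): */
static int sgemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda,
                       FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i, j;
	for ( j = 0; j < n; j++ )
	{
		FLOAT temp = 0.0;
		for ( i = 0; i < m; i++ )
			temp += a[i + j * lda] * x[i * inc_x];
		y[j * inc_y] += alpha * temp;
	}
	return(0);
}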

View File

@ -1,232 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell.c"
#else
#include "sgemv_t_microk_sandy.c"
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
{
FLOAT register temp0 = 0.0;
BLASLONG i;
for ( i=0; i<n ; i++)
{
temp0 += a[i] * x[i];
}
temp0 *= alpha ;
*y += temp0;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *a_ptrl;
BLASLONG m1;
BLASLONG register m2;
FLOAT *xbuffer;
xbuffer = buffer;
BLASLONG register Mblock;
m1 = m / 1024 ;
m2 = m % 1024 ;
x_ptr = x;
a_ptr = a;
for (j=0; j<m1; j++)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(1024,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += 1024;
x_ptr += 1024 * inc_x;
}
if ( m2 == 0 ) return(0);
Mblock = 512;
while ( Mblock >= 16 )
{
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
}
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
xbuffer = x_ptr;
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
}
return(0);
}

View File

@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 {
 	BLASLONG register i = 0;
@@ -40,38 +40,76 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 	"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
 	"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
-	".align 16 \n\t"
-	".L01LOOP%=: \n\t"
-	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+	"testq $0x04, %1 \n\t"
+	"jz .L08LABEL%= \n\t"
+	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
+	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
+	"addq $4 , %0 \n\t"
+	"subq $4 , %1 \n\t"
+	".L08LABEL%=: \n\t"
+	"testq $0x08, %1 \n\t"
+	"jz .L16LABEL%= \n\t"
+	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
+	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
+	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
+	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
+	"addq $8 , %0 \n\t"
+	"subq $8 , %1 \n\t"
+	".L16LABEL%=: \n\t"
+	"cmpq $0, %1 \n\t"
+	"je .L16END%= \n\t"
+	".align 16 \n\t"
+	".L01LOOP%=: \n\t"
+	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 	"prefetcht0 384(%4,%0,4) \n\t"
 	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
 	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
 	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
 	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
 	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
 	"prefetcht0 384(%5,%0,4) \n\t"
-	".align 2 \n\t"
 	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
 	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
 	"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
 	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
 	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
 	"prefetcht0 384(%6,%0,4) \n\t"
-	".align 2 \n\t"
 	"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t"
 	"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t"
 	"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
 	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
 	"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t"
 	"prefetcht0 384(%7,%0,4) \n\t"
 	"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 48(%5,%0,4), %%xmm15, %%xmm5 \n\t"
-	"vfmaddps %%xmm6, 48(%6,%0,4), %%xmm15, %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-	"addq $16, %0 \n\t"
-	"subq $16, %1 \n\t"
-	"jnz .L01LOOP%= \n\t"
+	"addq $16, %0 \n\t"
+	"vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t"
+	"vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t"
+	"subq $16, %1 \n\t"
+	"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"
+	"jnz .L01LOOP%= \n\t"
+	".L16END%=: \n\t"
 	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
 	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
 	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
@@ -1,99 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
//n = n / 16;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
"sarq $4, %%rax \n\t" // n = n / 16
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 512(%%rsi) \n\t"
"prefetcht0 (%%r8) \n\t" //prefetch next line of a
"vmovups (%%rsi), %%xmm4 \n\t"
"vmovups 4*4(%%rsi), %%xmm5 \n\t"
"vmovups 8*4(%%rsi), %%xmm6 \n\t"
"vmovups 12*4(%%rsi), %%xmm7 \n\t"
"vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp
"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
"addq $16*4 , %%rsi \n\t" // increment pointer of a
"addq $16*4 , %%rdi \n\t" // increment pointer of c
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t"
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
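For reference, the file deleted above implemented a single-row kernel: it computed one dot product of a 16-way-unrolled row of a with x, scaled it by alpha and accumulated it into *y (the final vfmaddss/vmovss pair), using lda only to prefetch the next row. Below is a scalar sketch of that computation, not OpenBLAS code, assuming n is a positive multiple of 16 as the "sarq $4" implies; the _ref name is illustrative.

/* Scalar model of the deleted sgemv_kernel_16: *y += alpha * dot(a, x) over
   one row of length n (assumed to be a positive multiple of 16). lda was used
   by the assembly only to prefetch the next row. */
static void sgemv_kernel_16_ref(long n, float alpha, const float *a, long lda,
                                const float *x, float *y)
{
    (void)lda;                       /* prefetch-only argument in the asm version */
    float sum = 0.0f;
    for (long i = 0; i < n; i++)     /* the asm consumes 16 floats per iteration  */
        sum += a[i] * x[i];
    *y += alpha * sum;               /* matches the final vfmaddss, then vmovss   */
}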
Some files were not shown because too many files have changed in this diff.