Merge pull request #440 from wernsaar/develop
optimizations for level1 and level2 blas functions
This commit is contained in:
commit
2702323f7d
|
@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
|
|||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
EXPRECISION = 1
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
FCOMMON_OPT += -m128bit-long-double
|
||||
endif
|
||||
endif
|
||||
|
@ -350,6 +350,7 @@ ifeq ($(C_COMPILER), INTEL)
|
|||
CCOMMON_OPT += -wd981
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
# ifeq logical or. GCC or LSB
|
||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||
|
|
|
@ -35,7 +35,10 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto \
|
||||
ssymv.goto dsymv.goto \
|
||||
sdot.goto ddot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
|
||||
chemv.goto zhemv.goto \
|
||||
chemm.goto zhemm.goto \
|
||||
cherk.goto zherk.goto \
|
||||
cher2k.goto zher2k.goto \
|
||||
|
@ -53,7 +56,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
|||
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
|
||||
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
|
||||
sger.acml dger.acml \
|
||||
ssymv.acml dsymv.acml \
|
||||
sdot.acml ddot.acml \
|
||||
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
|
||||
ssymv.acml dsymv.acml csymv.acml zsymv.acml \
|
||||
chemv.acml zhemv.acml \
|
||||
chemm.acml zhemm.acml \
|
||||
cherk.acml zherk.acml \
|
||||
cher2k.acml zher2k.acml \
|
||||
|
@ -71,7 +77,10 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
|||
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
|
||||
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
|
||||
sger.atlas dger.atlas \
|
||||
ssymv.atlas dsymv.atlas \
|
||||
sdot.atlas ddot.atlas \
|
||||
saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
|
||||
ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
|
||||
chemv.atlas zhemv.atlas \
|
||||
chemm.atlas zhemm.atlas \
|
||||
cherk.atlas zherk.atlas \
|
||||
|
@ -90,7 +99,10 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
|||
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
|
||||
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
|
||||
sger.mkl dger.mkl \
|
||||
ssymv.mkl dsymv.mkl \
|
||||
sdot.mkl ddot.mkl \
|
||||
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
|
||||
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
|
||||
chemv.mkl zhemv.mkl \
|
||||
chemm.mkl zhemm.mkl \
|
||||
cherk.mkl zherk.mkl \
|
||||
cher2k.mkl zher2k.mkl \
|
||||
|
@ -100,7 +112,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
|||
spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \
|
||||
ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
|
||||
|
||||
all :: goto atlas acml mkl
|
||||
all :: goto mkl atlas acml
|
||||
|
||||
##################################### Slinpack ####################################################
|
||||
slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
|
@ -732,6 +744,32 @@ dsymv.atlas : dsymv.$(SUFFIX)
|
|||
dsymv.mkl : dsymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Csymv ####################################################
|
||||
csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
csymv.acml : csymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
csymv.atlas : csymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
csymv.mkl : csymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zsymv ####################################################
|
||||
zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
zsymv.acml : zsymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zsymv.atlas : zsymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zsymv.mkl : zsymv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sgeev ####################################################
|
||||
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
@ -896,6 +934,115 @@ zpotrf.atlas : zpotrf.$(SUFFIX)
|
|||
zpotrf.mkl : zpotrf.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Chemv ####################################################
|
||||
|
||||
chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
chemv.acml : chemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
chemv.atlas : chemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
chemv.mkl : chemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zhemv ####################################################
|
||||
|
||||
zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
zhemv.acml : zhemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zhemv.atlas : zhemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zhemv.mkl : zhemv.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sdot ####################################################
|
||||
sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
# Link the sdot benchmark against ACML. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
sdot.acml : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
# Link the sdot benchmark against ATLAS. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
sdot.atlas : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
# Link the sdot benchmark against MKL. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
sdot.mkl : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ddot ####################################################
|
||||
ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
# Link the ddot benchmark against ACML. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
ddot.acml : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
# Link the ddot benchmark against ATLAS. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
ddot.atlas : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
# Link the ddot benchmark against MKL. The leading '-' makes make ignore a
# failed link (e.g. the vendor library is unavailable), consistent with the
# other *.acml / *.atlas / *.mkl rules in this file.
ddot.mkl : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Saxpy ####################################################
|
||||
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
saxpy.acml : saxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
saxpy.atlas : saxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
saxpy.mkl : saxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Daxpy ####################################################
|
||||
daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
daxpy.acml : daxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
daxpy.atlas : daxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
daxpy.mkl : daxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Caxpy ####################################################
|
||||
|
||||
caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
caxpy.acml : caxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
caxpy.atlas : caxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
caxpy.mkl : caxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zaxpy ####################################################
|
||||
|
||||
zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
|
||||
|
||||
zaxpy.acml : zaxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zaxpy.atlas : zaxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zaxpy.mkl : zaxpy.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
|
||||
###################################################################################################
|
||||
|
||||
|
@ -1037,6 +1184,12 @@ ssymv.$(SUFFIX) : symv.c
|
|||
dsymv.$(SUFFIX) : symv.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
csymv.$(SUFFIX) : symv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
zsymv.$(SUFFIX) : symv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
sgeev.$(SUFFIX) : geev.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
|
@ -1073,7 +1226,29 @@ cpotrf.$(SUFFIX) : potrf.c
|
|||
zpotrf.$(SUFFIX) : potrf.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
chemv.$(SUFFIX) : hemv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
zhemv.$(SUFFIX) : hemv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
sdot.$(SUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
ddot.$(SUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
saxpy.$(SUFFIX) : axpy.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
daxpy.$(SUFFIX) : axpy.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
caxpy.$(SUFFIX) : axpy.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
zaxpy.$(SUFFIX) : axpy.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,201 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef AXPY
|
||||
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define AXPY BLASFUNC(zaxpy)
|
||||
#else
|
||||
#define AXPY BLASFUNC(caxpy)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define AXPY BLASFUNC(daxpy)
|
||||
#else
|
||||
#define AXPY BLASFUNC(saxpy)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
static void *huge_malloc(BLASLONG size){
|
||||
int shmid;
|
||||
void *address;
|
||||
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
if ((shmid =shmget(IPC_PRIVATE,
|
||||
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
|
||||
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
|
||||
printf( "Memory allocation failed(shmget).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
address = shmat(shmid, NULL, SHM_RND);
|
||||
|
||||
if ((BLASLONG)address == -1){
|
||||
printf( "Memory allocation failed(shmat).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Weak alias: `main` resolves to MAIN__ unless another definition of main is
 * linked in. Declared as returning int so the alias has the same type as
 * MAIN__ (which returns int) and so main has the return type C requires. */
int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -0,0 +1,195 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef DOT
|
||||
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define DOT BLASFUNC(ddot)
|
||||
#else
|
||||
#define DOT BLASFUNC(sdot)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
static void *huge_malloc(BLASLONG size){
|
||||
int shmid;
|
||||
void *address;
|
||||
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
if ((shmid =shmget(IPC_PRIVATE,
|
||||
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
|
||||
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
|
||||
printf( "Memory allocation failed(shmget).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
address = shmat(shmid, NULL, SHM_RND);
|
||||
|
||||
if ((BLASLONG)address == -1){
|
||||
printf( "Memory allocation failed(shmat).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT result;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
result = DOT (&m, x, &inc_x, y, &inc_y );
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Weak alias: `main` resolves to MAIN__ unless another definition of main is
 * linked in. Declared as returning int so the alias has the same type as
 * MAIN__ (which returns int) and so main has the return type C requires. */
int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){
|
|||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
|
|
|
@ -0,0 +1,208 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef HEMV
|
||||
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define HEMV BLASFUNC(zhemv)
|
||||
#else
|
||||
#define HEMV BLASFUNC(chemv)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
static void *huge_malloc(BLASLONG size){
|
||||
int shmid;
|
||||
void *address;
|
||||
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
if ((shmid =shmget(IPC_PRIVATE,
|
||||
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
|
||||
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
|
||||
printf( "Memory allocation failed(shmget).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
address = shmat(shmid, NULL, SHM_RND);
|
||||
|
||||
if ((BLASLONG)address == -1){
|
||||
printf( "Memory allocation failed(shmat).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
char uplo='L';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Weak alias: `main` resolves to MAIN__ unless another definition of main is
 * linked in. Declared as returning int so the alias has the same type as
 * MAIN__ (which returns int) and so main has the return type C requires. */
int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -0,0 +1,70 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Generic C kernel: symmetric matrix-vector update reading the part of
 * each column that lies on and below the diagonal.
 *
 * For every column j in [0, offset) the diagonal element a[j*lda+j] and the
 * elements a[j*lda+i], j < i < m, are used twice: once to add
 * alpha * x[j] * a(i,j) into y[i], and once to accumulate the dot product
 * of that column part with x, which is then scaled by alpha and added
 * to y[j].
 *
 *   m, offset     - row bound and number of columns processed
 *   alpha         - scalar multiplier
 *   a, lda        - matrix storage and the stride between columns
 *   x, inc_x      - input vector and its element stride
 *   y, inc_y      - vector updated in place and its element stride
 *   buffer        - not used by this implementation
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG col, row;
	BLASLONG xd = 0;	/* strided index of x[col] */
	BLASLONG yd = 0;	/* strided index of y[col] */

	for (col = 0; col < offset; col++, xd += inc_x, yd += inc_y)
	{
		FLOAT *acol = a + col * lda;
		FLOAT scaled_x = alpha * x[xd];
		FLOAT dot = 0.0;
		BLASLONG xr = xd;
		BLASLONG yr = yd;

		/* diagonal contribution */
		y[yd] += scaled_x * acol[col];

		/* strictly-below-diagonal part of this column */
		for (row = col + 1; row < m; row++)
		{
			xr += inc_x;
			yr += inc_y;
			y[yr] += scaled_x * acol[row];
			dot += acol[row] * x[xr];
		}

		y[yd] += alpha * dot;
	}

	return 0;
}
|
||||
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Generic C kernel: symmetric matrix-vector update reading the part of
 * each column that lies on and above the diagonal.
 *
 * Only the last `offset` columns, j in [m - offset, m), are processed.
 * For each of them the elements a[j*lda+i], 0 <= i < j, add
 * alpha * x[j] * a(i,j) into y[i] and are also accumulated into a dot
 * product with x; y[j] then receives the diagonal term plus alpha times
 * that dot product.
 *
 *   m, offset     - column bound and number of trailing columns processed
 *   alpha         - scalar multiplier
 *   a, lda        - matrix storage and the stride between columns
 *   x, inc_x      - input vector and its element stride
 *   y, inc_y      - vector updated in place and its element stride
 *   buffer        - not used by this implementation
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG first = m - offset;	/* first column handled here */
	BLASLONG col, row;
	BLASLONG xd = first * inc_x;	/* strided index of x[col] */
	BLASLONG yd = first * inc_y;	/* strided index of y[col] */

	for (col = first; col < m; col++, xd += inc_x, yd += inc_y)
	{
		FLOAT *acol = a + col * lda;
		FLOAT scaled_x = alpha * x[xd];
		FLOAT dot = 0.0;
		BLASLONG xr = 0;
		BLASLONG yr = 0;

		/* strictly-above-diagonal part of this column */
		for (row = 0; row < col; row++, xr += inc_x, yr += inc_y)
		{
			y[yr] += scaled_x * acol[row];
			dot += acol[row] * x[xr];
		}

		/* diagonal term and the accumulated dot product */
		y[yd] += scaled_x * acol[col] + alpha * dot;
	}

	return 0;
}
|
||||
|
||||
|
|
@ -1,3 +1,15 @@
|
|||
# Kernel source selection: each *KERNEL variable below names a C source file.
# NOTE(review): presumably consumed by the kernel build rules for one CPU
# target; neither the rules nor the target name are visible here.
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
|
||||
DSYMV_U_KERNEL = dsymv_U.c
|
||||
DSYMV_L_KERNEL = dsymv_L.c
|
||||
SSYMV_U_KERNEL = ssymv_U.c
|
||||
SSYMV_L_KERNEL = ssymv_L.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
|
||||
|
|
|
@ -1,5 +1,17 @@
|
|||
# Kernel source selection: each *KERNEL / *COPY variable below names a C or
# assembly (.S) source file.
# NOTE(review): presumably consumed by the kernel build rules for one CPU
# target; neither the rules nor the target name are visible here.
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
|
||||
DSYMV_U_KERNEL = dsymv_U.c
|
||||
DSYMV_L_KERNEL = dsymv_L.c
|
||||
SSYMV_U_KERNEL = ssymv_U.c
|
||||
SSYMV_L_KERNEL = ssymv_L.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
|
||||
SGEMMINCOPY = gemm_ncopy_4.S
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "caxpy_microk_bulldozer-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
BLASLONG register ix = 0;
|
||||
FLOAT da_r = alpha[0];
|
||||
FLOAT da_i = alpha[1];
|
||||
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
#if !defined(CONJ)
|
||||
y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
|
||||
y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
|
||||
y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
|
||||
y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
|
||||
#else
|
||||
y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
|
||||
y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
|
||||
y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
|
||||
y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
|
||||
#endif
|
||||
|
||||
ix+=4 ;
|
||||
i+=2 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Complex AXPY driver.
 *
 * x and y hold complex numbers as interleaved (real, imag) pairs.
 * Without CONJ: y += (da_r + i*da_i) * x.
 * With CONJ:    y += (da_r + i*da_i) * conj(x).
 *
 *   n             - number of complex elements; n <= 0 is a no-op
 *   da_r, da_i    - real and imaginary part of the scalar
 *   x, inc_x      - input vector and its stride in complex elements
 *   y, inc_y      - vector updated in place and its stride in complex elements
 *   dummy*        - not used by this implementation
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	FLOAT da[2];

	if ( n <= 0 )  return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Largest multiple of 8 not exceeding n. Kept in BLASLONG:
		 * storing it in an int would truncate counts that do not fit
		 * in an int and hand a bogus length to the kernel.
		 */
		BLASLONG n1 = n & -8;

		if ( n1 )
		{
			da[0] = da_r;
			da[1] = da_i;
			/* pass the array itself (FLOAT *), not its address (FLOAT (*)[2]) */
			caxpy_kernel_8(n1, x, y , da );
			ix = 2 * n1;
		}

		/* remaining 0..7 complex elements */
		i = n1;
		while(i < n)
		{
#if !defined(CONJ)
			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
			i++ ;
			ix += 2;
		}
		return(0);
	}

	/* general strides: convert from complex elements to FLOAT elements */
	inc_x *=2;
	inc_y *=2;

	while(i < n)
	{
#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
|
||||
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"prefetcht0 768(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
|
||||
"vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
|
||||
"vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
|
||||
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
|
||||
"prefetcht0 768(%3,%0,4) \n\t"
|
||||
|
||||
#if !defined(CONJ)
|
||||
"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
|
||||
"vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t"
|
||||
|
||||
"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
|
||||
"vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t"
|
||||
|
||||
"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
|
||||
"vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t"
|
||||
|
||||
"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
|
||||
"vmulps %%xmm1, %%xmm10, %%xmm10 \n\t"
|
||||
|
||||
"vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t"
|
||||
"vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t"
|
||||
"vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t"
|
||||
"vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t"
|
||||
|
||||
#else
|
||||
|
||||
"vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
|
||||
"vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
|
||||
"vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
|
||||
"vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
|
||||
"vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
|
||||
"vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
|
||||
"vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
|
||||
"vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
|
||||
|
||||
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
|
||||
"vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
|
||||
|
||||
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
|
||||
"vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
|
||||
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
|
||||
"vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
|
||||
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
|
||||
"vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
|
||||
|
||||
"vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t"
|
||||
"vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t"
|
||||
"vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t"
|
||||
"vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t"
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
"vmovups %%xmm12, (%3,%0,4) \n\t"
|
||||
"vmovups %%xmm13, 16(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm14, 32(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm15, 48(%3,%0,4) \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
|
||||
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $16*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 16*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
|
||||
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $8*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 8*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
|
||||
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $4*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 4*SIZE, AO
|
||||
decq %rax
|
||||
|
||||
.endm
|
||||
|
@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
|
||||
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $2*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 2*SIZE, AO
|
||||
decq %rax
|
||||
|
||||
.endm
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(NEHALEM)
|
||||
#include "daxpy_microk_nehalem-2.c"
|
||||
#elif defined(BULLDOZER)
|
||||
#include "daxpy_microk_bulldozer-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
FLOAT a = *alpha;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[i] += a * x[i];
|
||||
y[i+1] += a * x[i+1];
|
||||
y[i+2] += a * x[i+2];
|
||||
y[i+3] += a * x[i+3];
|
||||
y[i+4] += a * x[i+4];
|
||||
y[i+5] += a * x[i+5];
|
||||
y[i+6] += a * x[i+6];
|
||||
y[i+7] += a * x[i+7];
|
||||
i+=8 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * AXPY driver: y += da * x.
 *
 *   n             - number of elements; n <= 0 is a no-op
 *   da            - scalar multiplier
 *   x, inc_x      - input vector and its element stride
 *   y, inc_y      - vector updated in place and its element stride
 *   dummy*        - not used by this implementation
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 )  return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Largest multiple of 8 not exceeding n. Kept in BLASLONG:
		 * storing it in an int would truncate counts that do not fit
		 * in an int and hand a bogus length to the kernel.
		 */
		BLASLONG n1 = n & -8;

		if ( n1 )
			daxpy_kernel_8(n1, x, y , &da );

		/* remaining 0..7 elements */
		i = n1;
		while(i < n)
		{
			y[i] += da * x[i] ;
			i++ ;
		}
		return(0);
	}

	/* general strides */
	while(i < n)
	{
		y[iy] += da * x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}
	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vmovddup (%4), %%xmm0 \n\t" // alpha
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"prefetcht0 768(%3,%0,8) \n\t"
|
||||
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||
"vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x
|
||||
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
|
||||
".align 2 \n\t"
|
||||
"vmovups %%xmm8 , (%3,%0,8) \n\t"
|
||||
"vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x
|
||||
".align 2 \n\t"
|
||||
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
|
||||
"vmovups %%xmm9 , 16(%3,%0,8) \n\t"
|
||||
"prefetcht0 768(%2,%0,8) \n\t"
|
||||
".align 2 \n\t"
|
||||
"vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x
|
||||
"vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
|
||||
"vmovups %%xmm10, 32(%3,%0,8) \n\t"
|
||||
"vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x
|
||||
"vmovups %%xmm11, 48(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movsd (%4), %%xmm0 \n\t" // alpha
|
||||
"shufpd $0, %%xmm0, %%xmm0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
// "prefetcht0 192(%2,%0,8) \n\t"
|
||||
// "prefetcht0 192(%3,%0,8) \n\t"
|
||||
|
||||
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||
"movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
|
||||
"movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
|
||||
"movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
|
||||
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
|
||||
"movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
|
||||
"movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
|
||||
"movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
|
||||
|
||||
"mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x
|
||||
"mulpd %%xmm0 , %%xmm13 \n\t"
|
||||
"mulpd %%xmm0 , %%xmm14 \n\t"
|
||||
"mulpd %%xmm0 , %%xmm15 \n\t"
|
||||
|
||||
"addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x
|
||||
"addpd %%xmm13, %%xmm9 \n\t"
|
||||
"addpd %%xmm14, %%xmm10 \n\t"
|
||||
"addpd %%xmm15, %%xmm11 \n\t"
|
||||
|
||||
"movups %%xmm8 , (%3,%0,8) \n\t"
|
||||
"movups %%xmm9 , 16(%3,%0,8) \n\t"
|
||||
"movups %%xmm10, 32(%3,%0,8) \n\t"
|
||||
"movups %%xmm11, 48(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#include "ddot_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ddot_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
FLOAT dot = 0.0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
+ y[i+1] * x[i+1]
|
||||
+ y[i+2] * x[i+2]
|
||||
+ y[i+3] * x[i+3]
|
||||
+ y[i+4] * x[i+4]
|
||||
+ y[i+5] * x[i+5]
|
||||
+ y[i+6] * x[i+6]
|
||||
+ y[i+7] * x[i+7] ;
|
||||
|
||||
i+=8 ;
|
||||
|
||||
}
|
||||
*d += dot;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * DOT driver: returns the sum over k of x[k*inc_x] * y[k*inc_y].
 *
 *   n             - number of elements; n <= 0 yields 0.0
 *   x, inc_x      - first vector and its element stride
 *   y, inc_y      - second vector and its element stride
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	FLOAT  dot = 0.0 ;

	if ( n <= 0 )  return(dot);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Largest multiple of 8 not exceeding n. Kept in BLASLONG:
		 * storing it in an int would truncate counts that do not fit
		 * in an int and hand a bogus length to the kernel.
		 */
		BLASLONG n1 = n & -8;

		if ( n1 )
			ddot_kernel_8(n1, x, y , &dot );

		/* remaining 0..7 elements */
		i = n1;
		while(i < n)
		{
			dot += y[i] * x[i] ;
			i++ ;
		}
		return(dot);
	}

	/* general strides */
	while(i < n)
	{
		dot += y[iy] * x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}
	return(dot);
}
|
||||
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
/*
 * ddot_kernel_8 - dot product of two unit-stride vectors, 8 elements per
 * loop pass, using the 4-operand vfmaddpd (FMA4) instruction.
 *
 *   n    element count. The loop subtracts 8 per pass and exits only when
 *        the counter hits exactly zero, so n must be a positive multiple
 *        of 8.
 *   x,y  input vectors, read with unaligned 16-byte loads.
 *   dot  dot[0] is overwritten with the result (it is not accumulated).
 *
 * Assumes FLOAT is double and BLASLONG is 64 bits wide: the asm uses
 * packed-double instructions, an 8-byte index scale and addq/subq.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	// four independent 2-wide accumulators
	"vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%2,%0,8), %%xmm12 \n\t"	// 2 * x
	"vmovups 16(%2,%0,8), %%xmm13 \n\t"	// 2 * x
	"vmovups 32(%2,%0,8), %%xmm14 \n\t"	// 2 * x
	"vmovups 48(%2,%0,8), %%xmm15 \n\t"	// 2 * x

	"vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t"	// acc0 += x * y
	"vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t"	// acc1 += x * y
	"vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t"	// acc2 += x * y
	"vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t"	// acc3 += x * y

	"addq $8 , %0 \n\t"			// i += 8
	"subq $8 , %1 \n\t"			// n -= 8
	"jnz .L01LOOP%= \n\t"

	// fold the four accumulators into one, then add its two lanes
	"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
	"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
	"vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"

	"vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"

	"vmovsd %%xmm4, (%4) \n\t"		// dot[0] = result

	:
	  // i and n are modified by the loop, so they must be read-write
	  // operands; declaring them input-only lets the compiler assume
	  // the registers are unchanged.
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
/*
 * ddot_kernel_8 - dot product of two unit-stride vectors, 8 elements per
 * loop pass, using SSE2/SSE3 packed-double instructions.
 *
 *   n    element count. The loop subtracts 8 per pass and exits only when
 *        the counter hits exactly zero, so n must be a positive multiple
 *        of 8.
 *   x,y  input vectors, read with unaligned 16-byte loads.
 *   dot  dot[0] is overwritten with the result (it is not accumulated).
 *
 * Assumes FLOAT is double and BLASLONG is 64 bits wide: the asm uses
 * packed-double instructions, an 8-byte index scale and addq/subq.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	// four independent 2-wide accumulators
	"xorpd %%xmm4, %%xmm4 \n\t"
	"xorpd %%xmm5, %%xmm5 \n\t"
	"xorpd %%xmm6, %%xmm6 \n\t"
	"xorpd %%xmm7, %%xmm7 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%2,%0,8), %%xmm12 \n\t"	// 2 * x
	"movups (%3,%0,8), %%xmm8 \n\t"		// 2 * y
	"movups 16(%2,%0,8), %%xmm13 \n\t"	// 2 * x
	"movups 16(%3,%0,8), %%xmm9 \n\t"	// 2 * y
	"movups 32(%2,%0,8), %%xmm14 \n\t"	// 2 * x
	"movups 32(%3,%0,8), %%xmm10 \n\t"	// 2 * y
	"movups 48(%2,%0,8), %%xmm15 \n\t"	// 2 * x
	"movups 48(%3,%0,8), %%xmm11 \n\t"	// 2 * y

	"mulpd %%xmm8 , %%xmm12 \n\t"		// x * y
	"mulpd %%xmm9 , %%xmm13 \n\t"		// x * y
	"mulpd %%xmm10, %%xmm14 \n\t"		// x * y
	"mulpd %%xmm11, %%xmm15 \n\t"		// x * y

	"addpd %%xmm12, %%xmm4 \n\t"		// acc0 += x * y
	"addpd %%xmm13, %%xmm5 \n\t"		// acc1 += x * y
	"addpd %%xmm14, %%xmm6 \n\t"		// acc2 += x * y
	"addpd %%xmm15, %%xmm7 \n\t"		// acc3 += x * y

	"addq $8 , %0 \n\t"			// i += 8
	"subq $8 , %1 \n\t"			// n -= 8
	"jnz .L01LOOP%= \n\t"

	// fold the four accumulators into one, then add its two lanes
	"addpd %%xmm5, %%xmm4 \n\t"
	"addpd %%xmm7, %%xmm6 \n\t"
	"addpd %%xmm6, %%xmm4 \n\t"

	"haddpd %%xmm4, %%xmm4 \n\t"

	"movsd %%xmm4, (%4) \n\t"		// dot[0] = result

	:
	  // i and n are modified by the loop, so they must be read-write
	  // operands; declaring them input-only lets the compiler assume
	  // the registers are unchanged.
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(HASWELL)
|
||||
#include "dgemv_n_microk_haswell-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "dgemv_n_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16x4 1
|
||||
/*
 * dgemv_kernel_16x4 - rank-4 column update of y in SSE packed-double asm:
 *
 *     y[r] += ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3]
 *
 * for r = 0 .. n-1.
 *
 *   n    row count. Each loop pass handles 8 rows (addq $8 / subq $8,
 *        despite the "16" in the name) and the loop exits only when the
 *        counter hits exactly zero, so n must be a positive multiple of 8.
 *   ap   four column pointers, ap[0] .. ap[3], each read at unit stride.
 *   x    four multipliers x[0] .. x[3], each broadcast to both lanes.
 *   y    updated in place.
 *
 * Assumes FLOAT is double and BLASLONG is 64 bits wide: the asm uses
 * packed-double instructions, an 8-byte index scale and addq/subq.
 */
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"movddup (%2), %%xmm12 \n\t"	// x0
	"movddup 8(%2), %%xmm13 \n\t"	// x1
	"movddup 16(%2), %%xmm14 \n\t"	// x2
	"movddup 24(%2), %%xmm15 \n\t"	// x3

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 192(%3,%0,8) \n\t"
	"movups (%3,%0,8), %%xmm4 \n\t"	// 2 * y
	"movups 16(%3,%0,8), %%xmm5 \n\t"	// 2 * y
	"movups 32(%3,%0,8), %%xmm6 \n\t"	// 2 * y
	"movups 48(%3,%0,8), %%xmm7 \n\t"	// 2 * y
	"movups (%4,%0,8), %%xmm8 \n\t"	// 2 * a (column 0)
	"movups 16(%4,%0,8), %%xmm9 \n\t"	// 2 * a
	"movups 32(%4,%0,8), %%xmm10 \n\t"	// 2 * a
	"movups 48(%4,%0,8), %%xmm11 \n\t"	// 2 * a

	"prefetcht0 192(%4,%0,8) \n\t"
	"mulpd %%xmm12 , %%xmm8 \n\t"	// a * x0
	"mulpd %%xmm12 , %%xmm9 \n\t"	// a * x0
	"mulpd %%xmm12 , %%xmm10 \n\t"	// a * x0
	"mulpd %%xmm12 , %%xmm11 \n\t"	// a * x0
	"addpd %%xmm8 , %%xmm4 \n\t"	// y += a * x0
	"addpd %%xmm9 , %%xmm5 \n\t"	// y += a * x0
	"addpd %%xmm10 , %%xmm6 \n\t"	// y += a * x0
	"addpd %%xmm11 , %%xmm7 \n\t"	// y += a * x0

	"prefetcht0 192(%5,%0,8) \n\t"
	"movups (%5,%0,8), %%xmm8 \n\t"	// 2 * a (column 1)
	"movups 16(%5,%0,8), %%xmm9 \n\t"	// 2 * a
	"movups 32(%5,%0,8), %%xmm10 \n\t"	// 2 * a
	"movups 48(%5,%0,8), %%xmm11 \n\t"	// 2 * a
	"mulpd %%xmm13 , %%xmm8 \n\t"	// a * x1
	"mulpd %%xmm13 , %%xmm9 \n\t"	// a * x1
	"mulpd %%xmm13 , %%xmm10 \n\t"	// a * x1
	"mulpd %%xmm13 , %%xmm11 \n\t"	// a * x1
	"addpd %%xmm8 , %%xmm4 \n\t"	// y += a * x1
	"addpd %%xmm9 , %%xmm5 \n\t"	// y += a * x1
	"addpd %%xmm10 , %%xmm6 \n\t"	// y += a * x1
	"addpd %%xmm11 , %%xmm7 \n\t"	// y += a * x1

	"prefetcht0 192(%6,%0,8) \n\t"
	"movups (%6,%0,8), %%xmm8 \n\t"	// 2 * a (column 2)
	"movups 16(%6,%0,8), %%xmm9 \n\t"	// 2 * a
	"movups 32(%6,%0,8), %%xmm10 \n\t"	// 2 * a
	"movups 48(%6,%0,8), %%xmm11 \n\t"	// 2 * a
	"mulpd %%xmm14 , %%xmm8 \n\t"	// a * x2
	"mulpd %%xmm14 , %%xmm9 \n\t"	// a * x2
	"mulpd %%xmm14 , %%xmm10 \n\t"	// a * x2
	"mulpd %%xmm14 , %%xmm11 \n\t"	// a * x2
	"addpd %%xmm8 , %%xmm4 \n\t"	// y += a * x2
	"addpd %%xmm9 , %%xmm5 \n\t"	// y += a * x2
	"addpd %%xmm10 , %%xmm6 \n\t"	// y += a * x2
	"addpd %%xmm11 , %%xmm7 \n\t"	// y += a * x2

	"prefetcht0 192(%7,%0,8) \n\t"
	"movups (%7,%0,8), %%xmm8 \n\t"	// 2 * a (column 3)
	"movups 16(%7,%0,8), %%xmm9 \n\t"	// 2 * a
	"movups 32(%7,%0,8), %%xmm10 \n\t"	// 2 * a
	"movups 48(%7,%0,8), %%xmm11 \n\t"	// 2 * a
	"mulpd %%xmm15 , %%xmm8 \n\t"	// a * x3
	"mulpd %%xmm15 , %%xmm9 \n\t"	// a * x3
	"mulpd %%xmm15 , %%xmm10 \n\t"	// a * x3
	"mulpd %%xmm15 , %%xmm11 \n\t"	// a * x3
	"addpd %%xmm8 , %%xmm4 \n\t"	// y += a * x3
	"addpd %%xmm9 , %%xmm5 \n\t"	// y += a * x3
	"addpd %%xmm10 , %%xmm6 \n\t"	// y += a * x3
	"addpd %%xmm11 , %%xmm7 \n\t"	// y += a * x3

	"movups %%xmm4, (%3,%0,8) \n\t"	// store 2 * y
	"movups %%xmm5, 16(%3,%0,8) \n\t"	// store 2 * y
	"movups %%xmm6, 32(%3,%0,8) \n\t"	// store 2 * y
	"movups %%xmm7, 48(%3,%0,8) \n\t"	// store 2 * y

	"addq $8 , %0 \n\t"		// i += 8
	"subq $8 , %1 \n\t"		// n -= 8
	"jnz .L01LOOP%= \n\t"

	:
	  // i and n are modified by the loop, so they must be read-write
	  // operands; declaring them input-only lets the compiler assume
	  // the registers are unchanged.
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3])	// 7
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,299 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "dsymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "dsymv_L_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
/*
 * Portable fallback for the 4-column symmetric update, used when no
 * architecture-specific kernel has defined HAVE_KERNEL_4x4.
 *
 * For every row r in [from, to) and every column c in 0..3:
 *
 *     y[r]     += tmp1[c] * ap[c][r]
 *     temp2[c] += ap[c][r] * x[r]      (summed locally, added at the end)
 *
 * Rows are consumed in groups of four, so if (to - from) is not a multiple
 * of 4 the last group runs past `to`; the caller is expected to pass an
 * aligned range.
 */
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
{
	FLOAT colsum[4] = { 0.0, 0.0, 0.0, 0.0 };
	BLASLONG base;
	BLASLONG row;
	BLASLONG c;

	for (base = from; base < to; base += 4)
	{
		/* one group of four rows, columns visited 0..3 for each row */
		for (row = base; row < base + 4; row++)
		{
			for (c = 0; c < 4; c++)
			{
				y[row]    += tmp1[c] * ap[c][row];
				colsum[c] += ap[c][row] * x[row];
			}
		}
	}

	/* fold the local sums into the caller's accumulators */
	for (c = 0; c < 4; c++)
		temp2[c] += colsum[c];
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * Symmetric matrix-vector update y += alpha * A * x that reads only the
 * elements a[j*lda + i] with i >= j.
 *
 * For each processed column j:
 *     y[j] += alpha * x[j] * a(j,j)
 *     y[i] += alpha * x[j] * a(i,j)              for j < i < m
 *     y[j] += alpha * sum_{i>j} a(i,j) * x[i]    (the mirrored half)
 *
 *   m            number of rows visited per column (upper bound of i).
 *   offset       number of leading columns processed (j = 0 .. offset-1).
 *   alpha        scalar multiplier.
 *   a, lda       matrix storage and the distance between columns.
 *   x, inc_x     input vector and its stride.
 *   y, inc_y     vector updated in place and its stride.
 *   buffer       not used by this implementation.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row;
	BLASLONG col;
	BLASLONG c;
	BLASLONG xrow, yrow;		/* strided position of the current row    */
	BLASLONG xcol, ycol;		/* strided position of the current column */
	FLOAT ax;			/* alpha * x[col]                         */
	FLOAT below;			/* sum of a(i,col) * x[i] for i > col     */
	FLOAT ax4[4];			/* ax for a panel of four columns         */
	FLOAT below4[4];		/* below for a panel of four columns      */
	FLOAT *colp[4];			/* start of each column in the panel      */
	FLOAT *acol;			/* start of the current single column     */

	/* ---- general strides: plain column-by-column loop ---- */
	if ( !((inc_x == 1) && (inc_y == 1)) )
	{
		xcol = 0;
		ycol = 0;

		for (col = 0; col < offset; col++)
		{
			acol  = a + col * lda;
			ax    = alpha * x[xcol];
			below = 0.0;

			y[ycol] += ax * acol[col];		/* diagonal element */

			xrow = xcol;
			yrow = ycol;
			for (row = col + 1; row < m; row++)
			{
				xrow += inc_x;
				yrow += inc_y;
				y[yrow] += ax * acol[row];
				below   += acol[row] * x[xrow];
			}

			y[ycol] += alpha * below;
			xcol += inc_x;
			ycol += inc_y;
		}
		return(0);
	}

	/* ---- unit strides: panels of four columns ---- */
	BLASLONG panel_end = (offset/4)*4;

	for (col = 0; col < panel_end; col += 4)
	{
		colp[0] = &a[col*lda];
		colp[1] = colp[0] + lda;
		colp[2] = colp[1] + lda;
		colp[3] = colp[2] + lda;

		for (c = 0; c < 4; c++)
		{
			ax4[c]    = alpha * x[col+c];
			below4[c] = 0.0;
		}

		/* the four diagonal elements of the panel */
		for (c = 0; c < 4; c++)
			y[col+c] += ax4[c] * colp[c][col+c];

		/* strictly-lower part of the 4x4 diagonal tile:
		   column 0 covers rows col+1..col+3, column 1 rows col+2..col+3,
		   column 2 row col+3 only */
		for (c = 0; c < 3; c++)
		{
			for (row = col + c + 1; row < col + 4; row++)
			{
				y[row]    += ax4[c] * colp[c][row];
				below4[c] += colp[c][row] * x[row];
			}
		}

		/* rows below the tile: long panels hand the 4-aligned middle
		   to the kernel and finish the remainder here; short panels
		   are done entirely here */
		BLASLONG rest = col + 4;
		if ( m - (col + 1) >= 12 )
		{
			rest = (m/4)*4;
			if ( rest > col + 4 )
				dsymv_kernel_4x4(col+4, rest, colp, x, y, ax4, below4);
		}

		for (row = rest; row < m; row++)
		{
			for (c = 0; c < 4; c++)
			{
				y[row]    += ax4[c] * colp[c][row];
				below4[c] += colp[c][row] * x[row];
			}
		}

		for (c = 0; c < 4; c++)
			y[col+c] += alpha * below4[c];
	}

	/* ---- unit strides: leftover columns (fewer than four) ---- */
	for (col = panel_end; col < offset; col++)
	{
		acol  = a + col * lda;
		ax    = alpha * x[col];
		below = 0.0;

		y[col] += ax * acol[col];			/* diagonal element */

		for (row = col + 1; row < m; row++)
		{
			y[row] += ax * acol[row];
			below  += acol[row] * x[row];
		}

		y[col] += alpha * below;
	}

	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
/*
 * dsymv_kernel_4x4 - 4-column symmetric update in x86-64 asm using the
 * 4-operand vfmaddpd (FMA4) instruction.
 *
 * For every row r in [from, to) and every column c in 0..3:
 *
 *     y[r]     += temp1[c] * a[c][r]
 *     temp2[c] += a[c][r] * x[r]
 *
 *   from,to  row range. Each loop pass handles 4 rows and the loop exits
 *            only when the index equals `to`, so (to - from) must be a
 *            positive multiple of 4.
 *   a        four column pointers a[0] .. a[3].
 *   x        input vector, read at unit stride.
 *   y        updated in place.
 *   temp1    four multipliers, each broadcast to both lanes.
 *   temp2    four accumulators; the column sums are added to them.
 *
 * Assumes FLOAT is double and BLASLONG is 64 bits wide: the asm uses
 * packed-double instructions, an 8-byte index scale and addq/cmpq.
 */
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	__asm__  __volatile__
	(
	"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t"	// temp2[0]
	"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t"	// temp2[1]
	"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t"	// temp2[2]
	"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t"	// temp2[3]
	"vmovddup (%8), %%xmm4 \n\t"	// temp1[0]
	"vmovddup 8(%8), %%xmm5 \n\t"	// temp1[1]
	"vmovddup 16(%8), %%xmm6 \n\t"	// temp1[2]
	"vmovddup 24(%8), %%xmm7 \n\t"	// temp1[3]

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	// rows 0-1 of this pass accumulate into xmm9, rows 2-3 into xmm11
	"vmovups (%4,%0,8), %%xmm12 \n\t"	// 2 * a
	"vmovups (%2,%0,8), %%xmm8 \n\t"	// 2 * x
	"vmovups (%3,%0,8), %%xmm9 \n\t"	// 2 * y

	"vmovups (%5,%0,8), %%xmm13 \n\t"	// 2 * a

	"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t"	// y += temp1 * a
	"vmovups (%6,%0,8), %%xmm14 \n\t"	// 2 * a

	"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t"	// y += temp1 * a
	"vmovups (%7,%0,8), %%xmm15 \n\t"	// 2 * a

	"vmovups 16(%3,%0,8), %%xmm11 \n\t"	// 2 * y
	"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t"	// temp2 += x * a
	"vmovups 16(%4,%0,8), %%xmm12 \n\t"	// 2 * a
	"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t"	// y += temp1 * a
	"vmovups 16(%2,%0,8), %%xmm10 \n\t"	// 2 * x

	"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t"	// y += temp1 * a

	"vmovups 16(%5,%0,8), %%xmm13 \n\t"	// 2 * a
	"vmovups 16(%6,%0,8), %%xmm14 \n\t"	// 2 * a

	"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t"	// y += temp1 * a

	"vmovups 16(%7,%0,8), %%xmm15 \n\t"	// 2 * a
	"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t"	// y += temp1 * a

	"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t"	// y += temp1 * a

	"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t"	// temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t"	// y += temp1 * a
	"addq $4 , %0 \n\t"			// index += 4

	// index already advanced, hence the negative displacements
	"vmovups %%xmm9 , -32(%3,%0,8) \n\t"	// store 2 * y
	"vmovups %%xmm11 , -16(%3,%0,8) \n\t"	// store 2 * y

	"cmpq %0 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	"vmovsd (%9), %%xmm4 \n\t"	// temp2[0] on entry
	"vmovsd 8(%9), %%xmm5 \n\t"	// temp2[1] on entry
	"vmovsd 16(%9), %%xmm6 \n\t"	// temp2[2] on entry
	"vmovsd 24(%9), %%xmm7 \n\t"	// temp2[3] on entry

	// add the two lanes of each column sum
	"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
	"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
	"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
	"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"

	"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
	"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
	"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
	"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"

	"vmovsd %%xmm0 , (%9) \n\t"	// save temp2
	"vmovsd %%xmm1 , 8(%9) \n\t"	// save temp2
	"vmovsd %%xmm2 ,16(%9) \n\t"	// save temp2
	"vmovsd %%xmm3 ,24(%9) \n\t"	// save temp2

	:
	  // `from` is advanced by the loop, so it must be a read-write
	  // operand; declaring it input-only lets the compiler assume the
	  // register is unchanged.
	  "+r" (from)	// 0
	:
	  "r" (to),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (a[0]),	// 4
	  "r" (a[1]),	// 5
	  "r" (a[2]),	// 6
	  "r" (a[3]),	// 7
	  "r" (temp1),	// 8
	  "r" (temp2)	// 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
/*
 * dsymv_kernel_4x4 - 4-column symmetric update in x86-64 asm using
 * SSE2/SSE3 packed-double instructions.
 *
 * For every row r in [from, to) and every column c in 0..3:
 *
 *     y[r]     += temp1[c] * a[c][r]
 *     temp2[c] += a[c][r] * x[r]
 *
 *   from,to  row range. Each loop pass handles 2 rows (addq $2) and the
 *            loop exits only when the index equals `to`, so (to - from)
 *            must be a positive multiple of 2.
 *   a        four column pointers a[0] .. a[3].
 *   x        input vector, read at unit stride.
 *   y        updated in place.
 *   temp1    four multipliers, each broadcast to both lanes.
 *   temp2    four accumulators; the column sums are added to them.
 *
 * Assumes FLOAT is double and BLASLONG is 64 bits wide: the asm uses
 * packed-double instructions, an 8-byte index scale and addq/cmpq.
 */
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	__asm__  __volatile__
	(
	"xorpd %%xmm0 , %%xmm0 \n\t"	// temp2[0]
	"xorpd %%xmm1 , %%xmm1 \n\t"	// temp2[1]
	"xorpd %%xmm2 , %%xmm2 \n\t"	// temp2[2]
	"xorpd %%xmm3 , %%xmm3 \n\t"	// temp2[3]
	"movsd (%8), %%xmm4 \n\t"	// temp1[0]
	"movsd 8(%8), %%xmm5 \n\t"	// temp1[1]
	"movsd 16(%8), %%xmm6 \n\t"	// temp1[2]
	"movsd 24(%8), %%xmm7 \n\t"	// temp1[3]
	// copy the low lane into the high lane (broadcast)
	"shufpd $0, %%xmm4, %%xmm4 \n\t"
	"shufpd $0, %%xmm5, %%xmm5 \n\t"
	"shufpd $0, %%xmm6, %%xmm6 \n\t"
	"shufpd $0, %%xmm7, %%xmm7 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"movups (%4,%0,8), %%xmm12 \n\t"	// 2 * a
	"movups (%2,%0,8), %%xmm8 \n\t"	// 2 * x
	"movups %%xmm12 , %%xmm11 \n\t"	// scratch copy of a
	"movups (%3,%0,8), %%xmm9 \n\t"	// 2 * y
	"movups (%5,%0,8), %%xmm13 \n\t"	// 2 * a

	"mulpd %%xmm4 , %%xmm11 \n\t"	// temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t"	// y += temp1 * a
	"mulpd %%xmm8 , %%xmm12 \n\t"	// a * x
	"addpd %%xmm12 , %%xmm0 \n\t"	// temp2 += x * a

	"movups (%6,%0,8), %%xmm14 \n\t"	// 2 * a
	"movups (%7,%0,8), %%xmm15 \n\t"	// 2 * a

	"movups %%xmm13 , %%xmm11 \n\t"	// scratch copy of a
	"mulpd %%xmm5 , %%xmm11 \n\t"	// temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t"	// y += temp1 * a
	"mulpd %%xmm8 , %%xmm13 \n\t"	// a * x
	"addpd %%xmm13 , %%xmm1 \n\t"	// temp2 += x * a

	"movups %%xmm14 , %%xmm11 \n\t"	// scratch copy of a
	"mulpd %%xmm6 , %%xmm11 \n\t"	// temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t"	// y += temp1 * a
	"mulpd %%xmm8 , %%xmm14 \n\t"	// a * x
	"addpd %%xmm14 , %%xmm2 \n\t"	// temp2 += x * a

	"addq $2 , %0 \n\t"		// index += 2
	"movups %%xmm15 , %%xmm11 \n\t"	// scratch copy of a
	"mulpd %%xmm7 , %%xmm11 \n\t"	// temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t"	// y += temp1 * a
	"mulpd %%xmm8 , %%xmm15 \n\t"	// a * x
	"addpd %%xmm15 , %%xmm3 \n\t"	// temp2 += x * a

	// index already advanced, hence the negative displacement
	"movups %%xmm9,-16(%3,%0,8) \n\t"	// store 2 * y

	"cmpq %0 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	"movsd (%9), %%xmm4 \n\t"	// temp2[0] on entry
	"movsd 8(%9), %%xmm5 \n\t"	// temp2[1] on entry
	"movsd 16(%9), %%xmm6 \n\t"	// temp2[2] on entry
	"movsd 24(%9), %%xmm7 \n\t"	// temp2[3] on entry

	// add the two lanes of each column sum
	"haddpd %%xmm0, %%xmm0 \n\t"
	"haddpd %%xmm1, %%xmm1 \n\t"
	"haddpd %%xmm2, %%xmm2 \n\t"
	"haddpd %%xmm3, %%xmm3 \n\t"

	"addsd %%xmm4, %%xmm0 \n\t"
	"addsd %%xmm5, %%xmm1 \n\t"
	"addsd %%xmm6, %%xmm2 \n\t"
	"addsd %%xmm7, %%xmm3 \n\t"

	"movsd %%xmm0 , (%9) \n\t"	// save temp2
	"movsd %%xmm1 , 8(%9) \n\t"	// save temp2
	"movsd %%xmm2 , 16(%9) \n\t"	// save temp2
	"movsd %%xmm3 , 24(%9) \n\t"	// save temp2

	:
	  // `from` is advanced by the loop, so it must be a read-write
	  // operand; declaring it input-only lets the compiler assume the
	  // register is unchanged.
	  "+r" (from)	// 0
	:
	  "r" (to),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (a[0]),	// 4
	  "r" (a[1]),	// 5
	  "r" (a[2]),	// 6
	  "r" (a[3]),	// 7
	  "r" (temp1),	// 8
	  "r" (temp2)	// 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "dsymv_U_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "dsymv_U_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
/*
 * Portable C fallback for the 4-vector update over rows [0, n).
 *
 * For every row r in [0, n):
 *     yp[r]    += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     temp2[k] += ak[r] * xp[r]          (k = 0..3, summed locally first)
 *
 * temp1 is only read; temp2 receives the four partial dot products added on
 * top of whatever it already holds.
 */
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	/* Scale factors are loaded once, before any element of yp is written. */
	const FLOAT scale0 = temp1[0];
	const FLOAT scale1 = temp1[1];
	const FLOAT scale2 = temp1[2];
	const FLOAT scale3 = temp1[3];

	/* Local accumulators for the four dot products a_k . xp */
	FLOAT dot0 = 0.0;
	FLOAT dot1 = 0.0;
	FLOAT dot2 = 0.0;
	FLOAT dot3 = 0.0;

	BLASLONG row;

	for (row = 0; row < n; row++)
	{
		/* Read all inputs for this row before touching yp[row]. */
		const FLOAT c0 = a0[row];
		const FLOAT c1 = a1[row];
		const FLOAT c2 = a2[row];
		const FLOAT c3 = a3[row];
		const FLOAT xv = xp[row];

		yp[row] += scale0 * c0 + scale1 * c1 + scale2 * c2 + scale3 * c3;

		dot0 += c0 * xv;
		dot1 += c1 * xv;
		dot2 += c2 * xv;
		dot3 += c3 * xv;
	}

	temp2[0] += dot0;
	temp2[1] += dot1;
	temp2[2] += dot2;
	temp2[3] += dot3;
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_1x4
|
||||
|
||||
/*
 * Portable C fallback for the 4-vector update over the row range [from, to).
 *
 * For every row r in [from, to):
 *     yp[r]    += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     temp2[k] += ak[r] * xp[r]          (k = 0..3, summed locally first)
 *
 * Nothing is done when from >= to.  temp1 is only read; temp2 is accumulated
 * into, not overwritten.
 */
static void dsymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	/* Scale factors are loaded once, before any element of yp is written. */
	const FLOAT scale0 = temp1[0];
	const FLOAT scale1 = temp1[1];
	const FLOAT scale2 = temp1[2];
	const FLOAT scale3 = temp1[3];

	/* Local accumulators for the four dot products a_k . xp */
	FLOAT dot0 = 0.0;
	FLOAT dot1 = 0.0;
	FLOAT dot2 = 0.0;
	FLOAT dot3 = 0.0;

	BLASLONG row;

	for (row = from; row < to; row++)
	{
		/* Read all inputs for this row before touching yp[row]. */
		const FLOAT c0 = a0[row];
		const FLOAT c1 = a1[row];
		const FLOAT c2 = a2[row];
		const FLOAT c3 = a3[row];
		const FLOAT xv = xp[row];

		yp[row] += scale0 * c0 + scale1 * c1 + scale2 * c2 + scale3 * c3;

		dot0 += c0 * xv;
		dot1 += c1 * xv;
		dot2 += c2 * xv;
		dot3 += c3 * xv;
	}

	temp2[0] += dot0;
	temp2[1] += dot1;
	temp2[2] += dot2;
	temp2[3] += dot3;
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Single-vector update, unrolled by four.
 *
 * Processes rows [0, (n/4)*4); any remainder rows are left untouched.
 * For each processed row r:
 *     yp[r] += (*temp1) * a0[r]
 *     sum   += a0[r] * xp[r]
 *
 * On return *temp2 is OVERWRITTEN with the dot product (it is not added to
 * the previous contents of *temp2).
 */
static void dsymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	const FLOAT scale = *temp1;
	const BLASLONG limit = (n / 4) * 4;	/* largest multiple of 4 not exceeding n */
	FLOAT acc = 0.0;
	BLASLONG base;

	for (base = 0; base < limit; base += 4)
	{
		const FLOAT *xs = xp + base;
		FLOAT *ys = yp + base;

		/* Fetch the four matrix elements up front, as the scalar code did. */
		const FLOAT e0 = a0[base];
		const FLOAT e1 = a0[base + 1];
		const FLOAT e2 = a0[base + 2];
		const FLOAT e3 = a0[base + 3];

		/* Updates stay strictly sequential so the rounding of acc is unchanged. */
		ys[0] += scale * e0;
		acc += e0 * xs[0];

		ys[1] += scale * e1;
		acc += e1 * xs[1];

		ys[2] += scale * e2;
		acc += e2 * xs[2];

		ys[3] += scale * e3;
		acc += e3 * xs[3];
	}

	*temp2 = acc;
}
|
||||
|
||||
/*
 * Symmetric matrix-vector update using the upper triangle of a, restricted
 * to the trailing `offset` columns, i.e. columns j in [m - offset, m).
 *
 * For each such column j (element (i, j) is read as a[j*lda + i]):
 *     t     = alpha * x[j]
 *     y[i] += t * a[j*lda + i]                          for all i < j
 *     y[j] += t * a[j*lda + j] + alpha * sum_{i<j} a[j*lda + i] * x[i]
 *
 * Parameters:
 *     m, offset     - columns m-offset .. m-1 are processed
 *     alpha         - scalar multiplier
 *     a, lda        - matrix storage and its column stride
 *     x, inc_x      - input vector and its stride
 *     y, inc_y      - vector updated in place and its stride
 *     buffer        - not used by this implementation
 *
 * Returns 0 in all cases.
 *
 * Non-unit strides, or fewer than 16 columns to process, take the plain
 * scalar path.  Otherwise columns are handled four at a time through
 * dsymv_kernel_4x4 / dsymv_kernel_1x4, with a scalar pass for the small
 * triangular block on the diagonal, and leftover columns go through
 * dsymv_kernel_8x1.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix,iy;
	BLASLONG jx,jy;
	BLASLONG j;
	BLASLONG j1;
	BLASLONG j2;
	BLASLONG m2;
	FLOAT temp1;
	FLOAT temp2;
	FLOAT *xp, *yp;
	FLOAT *a0,*a1,*a2,*a3;
	FLOAT at0;
	FLOAT tmp1[4];
	FLOAT tmp2[4];

#if 0
	if( m != offset )
		printf("Symv_U: m=%ld offset=%ld\n",(long)m,(long)offset);
#endif

	BLASLONG m1 = m - offset;	/* first column to process */
	BLASLONG mrange = m - m1;	/* number of columns to process */

	if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) )
	{
		/* Generic scalar path: honours arbitrary strides. */
		jx = m1 * inc_x;
		jy = m1 * inc_y;

		for (j=m1; j<m; j++)
		{
			temp1 = alpha * x[jx];
			temp2 = 0.0;
			iy = 0;
			ix = 0;
			for (i=0; i<j; i++)
			{
				y[iy] += temp1 * a[j*lda+i];
				temp2 += a[j*lda+i] * x[ix];
				ix += inc_x;
				iy += inc_y;
			}
			y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
			jx += inc_x;
			jy += inc_y;
		}
		return(0);
	}

	xp = x;
	yp = y;

	/* Columns [m1, m2) are processed in groups of four; [m2, m) one by one. */
	m2 = m - ( mrange % 4 );

	for (j=m1; j<m2; j+=4)
	{
		tmp1[0] = alpha * xp[j];
		tmp1[1] = alpha * xp[j+1];
		tmp1[2] = alpha * xp[j+2];
		tmp1[3] = alpha * xp[j+3];
		tmp2[0] = 0.0;
		tmp2[1] = 0.0;
		tmp2[2] = 0.0;
		tmp2[3] = 0.0;
		a0 = &a[j*lda];
		a1 = a0+lda;
		a2 = a1+lda;
		a3 = a2+lda;

		/* Rows [0, j1) with j1 a multiple of 8 go to the block kernel ... */
		j1 = (j/8)*8;
		if ( j1 )
			dsymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2);

		/* ... and rows [j1, j) to the row-range kernel. */
		if ( j1 < j )
			dsymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2);

		/* Triangular block on the diagonal: rows j .. j1 of each of the four columns. */
		j2 = 0;
		for ( j1 = j ; j1 < j+4 ; j1++ )
		{
			temp1 = tmp1[j2];
			temp2 = tmp2[j2];
			a0 = &a[j1*lda];
			for ( i=j ; i<j1; i++ )
			{
				yp[i] += temp1 * a0[i];
				temp2 += a0[i] * xp[i];
			}
			yp[j1] += temp1 * a0[j1] + alpha * temp2;
			j2++;
		}
	}

	/* Remaining (fewer than four) columns. */
	for ( ; j<m; j++)
	{
		temp1 = alpha * xp[j];
		temp2 = 0.0;
		a0 = &a[j*lda];
		j1 = (j/8)*8;

		/* dsymv_kernel_8x1 overwrites temp2 with the partial dot product. */
		if ( j1 )
			dsymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2);

		for (i=j1 ; i<j; i++)
		{
			at0 = a0[i];
			yp[i] += temp1 * at0;
			temp2 += at0 * xp[i];
		}

		yp[j] += temp1 * a0[j] + alpha * temp2;
	}

	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||
|
||||
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
|
||||
"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
|
||||
"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
|
||||
"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
|
||||
"vmovddup (%8), %%xmm4 \n\t" // temp1[0]
|
||||
"vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
|
||||
"vmovddup 16(%8), %%xmm6 \n\t" // temp1[1]
|
||||
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
|
||||
|
||||
"xorq %0,%0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
"vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
|
||||
|
||||
"vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
|
||||
|
||||
"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
|
||||
|
||||
"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
|
||||
|
||||
"vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
|
||||
"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
|
||||
"vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
|
||||
|
||||
"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
|
||||
|
||||
"vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
|
||||
"vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
|
||||
|
||||
"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
|
||||
|
||||
"vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
|
||||
"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
|
||||
|
||||
"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
|
||||
"addq $4 , %0 \n\t"
|
||||
"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
|
||||
|
||||
"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
|
||||
"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
|
||||
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||
|
||||
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
|
||||
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
|
||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
"r" (a1), // 5
|
||||
"r" (a2), // 6
|
||||
"r" (a3), // 7
|
||||
"r" (temp1), // 8
|
||||
"r" (temp2) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||
|
||||
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
|
||||
"xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
|
||||
"xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
|
||||
"xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
|
||||
"movsd (%8), %%xmm4 \n\t" // temp1[0]
|
||||
"movsd 8(%8), %%xmm5 \n\t" // temp1[1]
|
||||
"movsd 16(%8), %%xmm6 \n\t" // temp1[2]
|
||||
"movsd 24(%8), %%xmm7 \n\t" // temp1[3]
|
||||
"shufpd $0, %%xmm4, %%xmm4 \n\t"
|
||||
"shufpd $0, %%xmm5, %%xmm5 \n\t"
|
||||
"shufpd $0, %%xmm6, %%xmm6 \n\t"
|
||||
"shufpd $0, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
"xorq %0,%0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
"movups %%xmm12 , %%xmm11 \n\t"
|
||||
"movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
|
||||
"movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
|
||||
|
||||
"mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
|
||||
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulpd %%xmm8 , %%xmm12 \n\t" // a * x
|
||||
"addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
|
||||
|
||||
"movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
|
||||
"movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
|
||||
|
||||
"movups %%xmm13 , %%xmm11 \n\t"
|
||||
"mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
|
||||
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulpd %%xmm8 , %%xmm13 \n\t" // a * x
|
||||
"addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
|
||||
|
||||
"movups %%xmm14 , %%xmm11 \n\t"
|
||||
"mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
|
||||
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulpd %%xmm8 , %%xmm14 \n\t" // a * x
|
||||
"addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
|
||||
|
||||
"addq $2 , %0 \n\t"
|
||||
"movups %%xmm15 , %%xmm11 \n\t"
|
||||
"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
|
||||
"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulpd %%xmm8 , %%xmm15 \n\t" // a * x
|
||||
"addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
|
||||
|
||||
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
||||
|
||||
"subq $2 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"haddpd %%xmm0, %%xmm0 \n\t"
|
||||
"haddpd %%xmm1, %%xmm1 \n\t"
|
||||
"haddpd %%xmm2, %%xmm2 \n\t"
|
||||
"haddpd %%xmm3, %%xmm3 \n\t"
|
||||
|
||||
"movsd %%xmm0 , (%9) \n\t" // save temp2
|
||||
"movsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||
"movsd %%xmm2 , 16(%9) \n\t" // save temp2
|
||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
"r" (a1), // 5
|
||||
"r" (a2), // 6
|
||||
"r" (a3), // 7
|
||||
"r" (temp1), // 8
|
||||
"r" (temp2) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(NEHALEM)
|
||||
#include "saxpy_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable C fallback for the contiguous AXPY kernel: y[k] += (*alpha) * x[k].
 *
 * Elements are processed in groups of eight starting at index 0 for as long
 * as the group's first index is below n, so when n is not a multiple of 8
 * the final group reaches past index n-1.
 */
static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT scale = *alpha;
	BLASLONG pos;

	for (pos = 0; pos < n; pos += 8)
	{
		const FLOAT *src = x + pos;
		FLOAT *dst = y + pos;

		dst[0] += scale * src[0];
		dst[1] += scale * src[1];
		dst[2] += scale * src[2];
		dst[3] += scale * src[3];
		dst[4] += scale * src[4];
		dst[5] += scale * src[5];
		dst[6] += scale * src[6];
		dst[7] += scale * src[7];
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * AXPY driver: y[k*inc_y] += da * x[k*inc_x] for k in [0, n).
 *
 * Parameters:
 *     n                 - number of elements; n <= 0 is a no-op
 *     da                - scalar multiplier
 *     x, inc_x          - input vector and its stride
 *     y, inc_y          - vector updated in place and its stride
 *     dummy0, dummy1,
 *     dummy, dummy2     - not used by this implementation
 *
 * Returns 0 in all cases.
 *
 * When both strides are 1, the largest multiple-of-16 prefix is handed to
 * saxpy_kernel_16 and the remaining (at most 15) elements are done in a
 * scalar loop.  Any other stride combination uses the scalar strided loop.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 )  return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Keep the blocked length in BLASLONG: storing it in an int
		 * would truncate it (possibly to a negative value) when it
		 * does not fit, and the kernel would run with a bogus count.
		 */
		BLASLONG n1 = n & (BLASLONG)(-16);

		if ( n1 )
			saxpy_kernel_16(n1, x, y , &da );

		/* Scalar tail for the elements the kernel did not cover. */
		i = n1;
		while(i < n)
		{
			y[i] += da * x[i] ;
			i++ ;
		}
		return(0);
	}

	/* General strided case. */
	while(i < n)
	{
		y[iy] += da * x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}
	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movss (%4), %%xmm0 \n\t" // alpha
|
||||
"shufps $0, %%xmm0, %%xmm0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
// "prefetcht0 192(%2,%0,4) \n\t"
|
||||
// "prefetcht0 192(%3,%0,4) \n\t"
|
||||
|
||||
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||
"movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
||||
"movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
|
||||
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * y
|
||||
"movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y
|
||||
"movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y
|
||||
"movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y
|
||||
|
||||
"mulps %%xmm0 , %%xmm12 \n\t" // alpha * x
|
||||
"mulps %%xmm0 , %%xmm13 \n\t"
|
||||
"mulps %%xmm0 , %%xmm14 \n\t"
|
||||
"mulps %%xmm0 , %%xmm15 \n\t"
|
||||
|
||||
"addps %%xmm12, %%xmm8 \n\t" // y += alpha *x
|
||||
"addps %%xmm13, %%xmm9 \n\t"
|
||||
"addps %%xmm14, %%xmm10 \n\t"
|
||||
"addps %%xmm15, %%xmm11 \n\t"
|
||||
|
||||
"movups %%xmm8 , (%3,%0,4) \n\t"
|
||||
"movups %%xmm9 , 16(%3,%0,4) \n\t"
|
||||
"movups %%xmm10, 32(%3,%0,4) \n\t"
|
||||
"movups %%xmm11, 48(%3,%0,4) \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,109 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#include "sdot_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "sdot_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable C fallback for the contiguous dot-product kernel.
 *
 * Adds sum_k y[k] * x[k] to *d.  Elements are processed in groups of eight
 * starting at index 0 for as long as the group's first index is below n, so
 * when n is not a multiple of 8 the final group reaches past index n-1.
 */
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc = 0.0;
	BLASLONG pos;

	for (pos = 0; pos < n; pos += 8)
	{
		const FLOAT *xs = x + pos;
		const FLOAT *ys = y + pos;

		/* The eight products are summed left to right, then added to acc. */
		acc += ys[0] * xs[0]
		     + ys[1] * xs[1]
		     + ys[2] * xs[2]
		     + ys[3] * xs[3]
		     + ys[4] * xs[4]
		     + ys[5] * xs[5]
		     + ys[6] * xs[6]
		     + ys[7] * xs[7];
	}

	*d += acc;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Dot-product driver: returns sum over k in [0, n) of y[k*inc_y] * x[k*inc_x].
 *
 * Parameters:
 *     n          - number of elements; n <= 0 yields 0.0
 *     x, inc_x   - first vector and its stride
 *     y, inc_y   - second vector and its stride
 *
 * When both strides are 1, the largest multiple-of-16 prefix is handed to
 * sdot_kernel_16 and the remaining (at most 15) products are added in a
 * scalar loop.  Any other stride combination uses the scalar strided loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	FLOAT  dot = 0.0 ;

	if ( n <= 0 )  return(dot);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Keep the blocked length in BLASLONG: storing it in an int
		 * would truncate it (possibly to a negative value) when it
		 * does not fit, and the kernel would run with a bogus count.
		 */
		BLASLONG n1 = n & (BLASLONG)(-16);

		if ( n1 )
			sdot_kernel_16(n1, x, y , &dot );

		/* Scalar tail for the elements the kernel did not cover. */
		i = n1;
		while(i < n)
		{
			dot += y[i] * x[i] ;
			i++ ;
		}
		return(dot);
	}

	/* General strided case. */
	while(i < n)
	{
		dot += y[iy] * x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}
	return(dot);
}
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
|
||||
"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
|
||||
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
||||
"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
|
||||
|
||||
"vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y
|
||||
"vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y
|
||||
"vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y
|
||||
"vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||
"vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
|
||||
|
||||
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
|
||||
|
||||
"vmovss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorps %%xmm4, %%xmm4 \n\t"
|
||||
"xorps %%xmm5, %%xmm5 \n\t"
|
||||
"xorps %%xmm6, %%xmm6 \n\t"
|
||||
"xorps %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
|
||||
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||
"movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * x
|
||||
"movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
||||
"movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * x
|
||||
"movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
|
||||
"movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * x
|
||||
|
||||
"mulps %%xmm8 , %%xmm12 \n\t"
|
||||
"mulps %%xmm9 , %%xmm13 \n\t"
|
||||
"mulps %%xmm10, %%xmm14 \n\t"
|
||||
"mulps %%xmm11, %%xmm15 \n\t"
|
||||
|
||||
"addps %%xmm12, %%xmm4 \n\t"
|
||||
"addps %%xmm13, %%xmm5 \n\t"
|
||||
"addps %%xmm14, %%xmm6 \n\t"
|
||||
"addps %%xmm15, %%xmm7 \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"addps %%xmm5, %%xmm4 \n\t"
|
||||
"addps %%xmm7, %%xmm6 \n\t"
|
||||
"addps %%xmm6, %%xmm4 \n\t"
|
||||
|
||||
"haddps %%xmm4, %%xmm4 \n\t"
|
||||
"haddps %%xmm4, %%xmm4 \n\t"
|
||||
|
||||
"movss %%xmm4, (%4) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
|
||||
VFMADD231PS_( %ymm15,%ymm3,%ymm1 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $16*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 16*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
|
||||
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $8*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 8*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
|
||||
VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $4*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 4*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
|
||||
VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $2*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 2*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
|
||||
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $1*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 1*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
|
|
@ -0,0 +1,299 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "ssymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ssymv_L_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
|
||||
{
|
||||
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
|
||||
BLASLONG i;
|
||||
|
||||
for (i=from; i<to; i+=4)
|
||||
{
|
||||
|
||||
y[i] += tmp1[0] * ap[0][i];
|
||||
tmp2[0] += ap[0][i] * x[i];
|
||||
y[i] += tmp1[1] * ap[1][i];
|
||||
tmp2[1] += ap[1][i] * x[i];
|
||||
y[i] += tmp1[2] * ap[2][i];
|
||||
tmp2[2] += ap[2][i] * x[i];
|
||||
y[i] += tmp1[3] * ap[3][i];
|
||||
tmp2[3] += ap[3][i] * x[i];
|
||||
|
||||
y[i+1] += tmp1[0] * ap[0][i+1];
|
||||
tmp2[0] += ap[0][i+1] * x[i+1];
|
||||
y[i+1] += tmp1[1] * ap[1][i+1];
|
||||
tmp2[1] += ap[1][i+1] * x[i+1];
|
||||
y[i+1] += tmp1[2] * ap[2][i+1];
|
||||
tmp2[2] += ap[2][i+1] * x[i+1];
|
||||
y[i+1] += tmp1[3] * ap[3][i+1];
|
||||
tmp2[3] += ap[3][i+1] * x[i+1];
|
||||
|
||||
y[i+2] += tmp1[0] * ap[0][i+2];
|
||||
tmp2[0] += ap[0][i+2] * x[i+2];
|
||||
y[i+2] += tmp1[1] * ap[1][i+2];
|
||||
tmp2[1] += ap[1][i+2] * x[i+2];
|
||||
y[i+2] += tmp1[2] * ap[2][i+2];
|
||||
tmp2[2] += ap[2][i+2] * x[i+2];
|
||||
y[i+2] += tmp1[3] * ap[3][i+2];
|
||||
tmp2[3] += ap[3][i+2] * x[i+2];
|
||||
|
||||
y[i+3] += tmp1[0] * ap[0][i+3];
|
||||
tmp2[0] += ap[0][i+3] * x[i+3];
|
||||
y[i+3] += tmp1[1] * ap[1][i+3];
|
||||
tmp2[1] += ap[1][i+3] * x[i+3];
|
||||
y[i+3] += tmp1[2] * ap[2][i+3];
|
||||
tmp2[2] += ap[2][i+3] * x[i+3];
|
||||
y[i+3] += tmp1[3] * ap[3][i+3];
|
||||
tmp2[3] += ap[3][i+3] * x[i+3];
|
||||
|
||||
}
|
||||
|
||||
temp2[0] += tmp2[0];
|
||||
temp2[1] += tmp2[1];
|
||||
temp2[2] += tmp2[2];
|
||||
temp2[3] += tmp2[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * SSYMV driver, lower-triangular storage: updates y with alpha * A * x using
 * only the elements of A on and below the diagonal, for the first `offset`
 * columns of an m-row, column-major matrix with leading dimension lda.
 *
 * For each processed column j:
 *     y[j]  += alpha*x[j] * A[j][j]
 *     y[i]  += alpha*x[j] * A[i][j]          for i = j+1 .. m-1
 *     y[j]  += alpha * sum_i( A[i][j] * x[i] )
 *
 * Non-unit strides take a plain scalar path.  With unit strides the columns
 * are handled four at a time (calling ssymv_kernel_4x4 for the bulk of the
 * rows when enough rows remain), followed by the left-over columns one by one.
 * `buffer` is not used.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col, k;
	BLASLONG xi, yi;
	BLASLONG xj, yj;
	BLASLONG blocked_cols;
	BLASLONG first, m4;
	FLOAT scaled_x;
	FLOAT dot;
	FLOAT tmp1[4];
	FLOAT tmp2[4];
	FLOAT *ap[4];
	FLOAT *colp;

#if 0
	if ( m != offset )
		printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif

	/* ---- generic path: strided x and/or y ---- */
	if ( (inc_x != 1) || (inc_y != 1) )
	{
		xj = 0;
		yj = 0;

		for (col = 0; col < offset; col++)
		{
			colp     = &a[col*lda];
			scaled_x = alpha * x[xj];
			dot      = 0.0;

			y[yj] += scaled_x * colp[col];		/* diagonal element */

			xi = xj;
			yi = yj;
			for (row = col+1; row < m; row++)
			{
				xi += inc_x;
				yi += inc_y;
				y[yi] += scaled_x * colp[row];
				dot   += colp[row] * x[xi];
			}

			y[yj] += alpha * dot;
			xj += inc_x;
			yj += inc_y;
		}
		return(0);
	}

	/* ---- unit stride: blocks of four columns ---- */
	blocked_cols = (offset/4)*4;

	for (col = 0; col < blocked_cols; col += 4)
	{
		for (k = 0; k < 4; k++)
			tmp1[k] = alpha * x[col+k];

		for (k = 0; k < 4; k++)
			tmp2[k] = 0.0;

		ap[0] = &a[col*lda];
		for (k = 1; k < 4; k++)
			ap[k] = ap[k-1] + lda;

		/* diagonal elements of the 4x4 block */
		for (k = 0; k < 4; k++)
			y[col+k] += tmp1[k] * ap[k][col+k];

		/* strictly-lower part of the 4x4 diagonal block, column by column */
		for (k = 0; k < 3; k++)
		{
			for (row = col+k+1; row < col+4; row++)
			{
				y[row]  += tmp1[k] * ap[k][row];
				tmp2[k] += ap[k][row] * x[row];
			}
		}

		/* rows below the diagonal block: micro-kernel for the part that
		   ends on a multiple of 4 when at least 12 rows follow the first
		   column's diagonal, scalar code for whatever is left */
		first = col + 4;
		if ( m - (col+1) >= 12 )
		{
			m4 = (m/4)*4;
			if ( m4 > col+4 )
				ssymv_kernel_4x4(col+4, m4, ap, x, y, tmp1, tmp2);
			first = m4;
		}

		for (row = first; row < m; row++)
		{
			for (k = 0; k < 4; k++)
			{
				y[row]  += tmp1[k] * ap[k][row];
				tmp2[k] += ap[k][row] * x[row];
			}
		}

		for (k = 0; k < 4; k++)
			y[col+k] += alpha * tmp2[k];
	}

	/* ---- unit stride: remaining (offset % 4) columns ---- */
	for (col = blocked_cols; col < offset; col++)
	{
		colp     = &a[col*lda];
		scaled_x = alpha * x[col];
		dot      = 0.0;

		y[col] += scaled_x * colp[col];		/* diagonal element */

		for (row = col+1; row < m; row++)
		{
			y[row] += scaled_x * colp[row];
			dot    += colp[row] * x[row];
		}

		y[col] += alpha * dot;
	}

	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||
|
||||
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
|
||||
"vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
|
||||
"vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
|
||||
"vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
|
||||
"vbroadcastss (%8), %%xmm4 \n\t" // temp1[0]
|
||||
"vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1]
|
||||
"vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2]
|
||||
"vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
|
||||
"vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
|
||||
"vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y
|
||||
|
||||
"vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a
|
||||
|
||||
"vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
|
||||
"vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a
|
||||
|
||||
"vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
|
||||
"vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a
|
||||
|
||||
"vfmaddps %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
|
||||
"vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
|
||||
|
||||
"vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
|
||||
"vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
|
||||
"vmovups %%xmm9 , -16(%3,%0,4) \n\t"
|
||||
|
||||
"cmpq %0 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"vmovss (%9), %%xmm4 \n\t"
|
||||
"vmovss 4(%9), %%xmm5 \n\t"
|
||||
"vmovss 8(%9), %%xmm6 \n\t"
|
||||
"vmovss 12(%9), %%xmm7 \n\t"
|
||||
|
||||
"vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||
"vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||
"vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||
"vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||
"vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||
"vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||
"vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||
"vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||
|
||||
"vaddss %%xmm4, %%xmm0, %%xmm0 \n\t"
|
||||
"vaddss %%xmm5, %%xmm1, %%xmm1 \n\t"
|
||||
"vaddss %%xmm6, %%xmm2, %%xmm2 \n\t"
|
||||
"vaddss %%xmm7, %%xmm3, %%xmm3 \n\t"
|
||||
|
||||
"vmovss %%xmm0 , (%9) \n\t" // save temp2
|
||||
"vmovss %%xmm1 , 4(%9) \n\t" // save temp2
|
||||
"vmovss %%xmm2 , 8(%9) \n\t" // save temp2
|
||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a[0]), // 4
|
||||
"r" (a[1]), // 5
|
||||
"r" (a[2]), // 6
|
||||
"r" (a[3]), // 8
|
||||
"r" (temp1), // 8
|
||||
"r" (temp2) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||
|
||||
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorps %%xmm0 , %%xmm0 \n\t" // temp2[0]
|
||||
"xorps %%xmm1 , %%xmm1 \n\t" // temp2[1]
|
||||
"xorps %%xmm2 , %%xmm2 \n\t" // temp2[2]
|
||||
"xorps %%xmm3 , %%xmm3 \n\t" // temp2[3]
|
||||
"movss (%8), %%xmm4 \n\t" // temp1[0]
|
||||
"movss 4(%8), %%xmm5 \n\t" // temp1[1]
|
||||
"movss 8(%8), %%xmm6 \n\t" // temp1[2]
|
||||
"movss 12(%8), %%xmm7 \n\t" // temp1[3]
|
||||
"shufps $0, %%xmm4, %%xmm4 \n\t"
|
||||
"shufps $0, %%xmm5, %%xmm5 \n\t"
|
||||
"shufps $0, %%xmm6, %%xmm6 \n\t"
|
||||
"shufps $0, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
|
||||
"movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
|
||||
|
||||
"movups (%4,%0,4), %%xmm12 \n\t" // 4 * a
|
||||
"movups (%5,%0,4), %%xmm13 \n\t" // 4 * a
|
||||
|
||||
"movups %%xmm12 , %%xmm11 \n\t"
|
||||
"mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a
|
||||
"addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulps %%xmm8 , %%xmm12 \n\t" // a * x
|
||||
"addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
|
||||
|
||||
"movups (%6,%0,4), %%xmm14 \n\t" // 4 * a
|
||||
"movups (%7,%0,4), %%xmm15 \n\t" // 4 * a
|
||||
|
||||
"movups %%xmm13 , %%xmm11 \n\t"
|
||||
"mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a
|
||||
"addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulps %%xmm8 , %%xmm13 \n\t" // a * x
|
||||
"addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
|
||||
|
||||
"movups %%xmm14 , %%xmm11 \n\t"
|
||||
"mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a
|
||||
"addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulps %%xmm8 , %%xmm14 \n\t" // a * x
|
||||
"addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
|
||||
|
||||
"movups %%xmm15 , %%xmm11 \n\t"
|
||||
"mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a
|
||||
"addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
|
||||
"mulps %%xmm8 , %%xmm15 \n\t" // a * x
|
||||
"addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
|
||||
|
||||
"movups %%xmm9, (%3,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"cmpq %0 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"movss (%9), %%xmm4 \n\t" // temp1[0]
|
||||
"movss 4(%9), %%xmm5 \n\t" // temp1[1]
|
||||
"movss 8(%9), %%xmm6 \n\t" // temp1[2]
|
||||
"movss 12(%9), %%xmm7 \n\t" // temp1[3]
|
||||
|
||||
"haddps %%xmm0, %%xmm0 \n\t"
|
||||
"haddps %%xmm1, %%xmm1 \n\t"
|
||||
"haddps %%xmm2, %%xmm2 \n\t"
|
||||
"haddps %%xmm3, %%xmm3 \n\t"
|
||||
"haddps %%xmm0, %%xmm0 \n\t"
|
||||
"haddps %%xmm1, %%xmm1 \n\t"
|
||||
"haddps %%xmm2, %%xmm2 \n\t"
|
||||
"haddps %%xmm3, %%xmm3 \n\t"
|
||||
|
||||
"addss %%xmm4, %%xmm0 \n\t"
|
||||
"addss %%xmm5, %%xmm1 \n\t"
|
||||
"addss %%xmm6, %%xmm2 \n\t"
|
||||
"addss %%xmm7, %%xmm3 \n\t"
|
||||
|
||||
"movss %%xmm0 , (%9) \n\t" // save temp2
|
||||
"movss %%xmm1 , 4(%9) \n\t" // save temp2
|
||||
"movss %%xmm2 , 8(%9) \n\t" // save temp2
|
||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
:
|
||||
"r" (from), // 0
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a[0]), // 4
|
||||
"r" (a[1]), // 5
|
||||
"r" (a[2]), // 6
|
||||
"r" (a[3]), // 7
|
||||
"r" (temp1), // 8
|
||||
"r" (temp2) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "ssymv_U_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ssymv_U_microk_nehalem-2.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
FLOAT at0,at1,at2,at3;
|
||||
FLOAT x;
|
||||
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
|
||||
FLOAT tp0;
|
||||
FLOAT tp1;
|
||||
FLOAT tp2;
|
||||
FLOAT tp3;
|
||||
BLASLONG i;
|
||||
|
||||
tp0 = temp1[0];
|
||||
tp1 = temp1[1];
|
||||
tp2 = temp1[2];
|
||||
tp3 = temp1[3];
|
||||
|
||||
for (i=0; i<n; i++)
|
||||
{
|
||||
at0 = a0[i];
|
||||
at1 = a1[i];
|
||||
at2 = a2[i];
|
||||
at3 = a3[i];
|
||||
x = xp[i];
|
||||
yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
|
||||
tmp2[0] += at0 * x;
|
||||
tmp2[1] += at1 * x;
|
||||
tmp2[2] += at2 * x;
|
||||
tmp2[3] += at3 * x;
|
||||
|
||||
}
|
||||
|
||||
temp2[0] += tmp2[0];
|
||||
temp2[1] += tmp2[1];
|
||||
temp2[2] += tmp2[2];
|
||||
temp2[3] += tmp2[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_1x4
|
||||
|
||||
static void ssymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
FLOAT at0,at1,at2,at3;
|
||||
FLOAT x;
|
||||
FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
|
||||
FLOAT tp0;
|
||||
FLOAT tp1;
|
||||
FLOAT tp2;
|
||||
FLOAT tp3;
|
||||
BLASLONG i;
|
||||
|
||||
tp0 = temp1[0];
|
||||
tp1 = temp1[1];
|
||||
tp2 = temp1[2];
|
||||
tp3 = temp1[3];
|
||||
|
||||
for (i=from; i<to; i++)
|
||||
{
|
||||
at0 = a0[i];
|
||||
at1 = a1[i];
|
||||
at2 = a2[i];
|
||||
at3 = a3[i];
|
||||
x = xp[i];
|
||||
yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
|
||||
tmp2[0] += at0 * x;
|
||||
tmp2[1] += at1 * x;
|
||||
tmp2[2] += at2 * x;
|
||||
tmp2[3] += at3 * x;
|
||||
|
||||
}
|
||||
|
||||
temp2[0] += tmp2[0];
|
||||
temp2[1] += tmp2[1];
|
||||
temp2[2] += tmp2[2];
|
||||
temp2[3] += tmp2[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Single-column helper for the upper-triangular SSYMV driver.
 *
 * Processes rows 0 .. (n/4)*4 - 1 of column a0 in groups of four:
 *
 *     yp[r] += (*temp1) * a0[r]
 *     sum   += a0[r] * xp[r]
 *
 * The accumulated sum is written to *temp2 (assigned, not added), so any
 * previous value of *temp2 is discarded.
 */
static void ssymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	FLOAT col[4];
	FLOAT sum   = 0.0;
	FLOAT scale = *temp1;
	BLASLONG limit = (n/4)*4;
	BLASLONG base, k;

	for (base = 0; base < limit; base += 4)
	{
		/* load the four matrix elements first, then update y and the sum */
		for (k = 0; k < 4; k++)
			col[k] = a0[base+k];

		for (k = 0; k < 4; k++)
		{
			yp[base+k] += scale * col[k];
			sum        += col[k] * xp[base+k];
		}
	}

	*temp2 = sum;
}
|
||||
|
||||
/*
 * SSYMV driver, upper-triangular storage: updates y with alpha * A * x using
 * only the elements of A on and above the diagonal, for the last `offset`
 * columns (m-offset .. m-1) of a column-major matrix with leading dimension lda.
 *
 * For each processed column j:
 *     y[i]  += alpha*x[j] * A[i][j]          for i = 0 .. j-1
 *     y[j]  += alpha*x[j] * A[j][j] + alpha * sum_i( A[i][j] * x[i] )
 *
 * A plain scalar path is used for non-unit strides or when fewer than 16
 * columns are processed.  Otherwise columns are handled four at a time
 * through ssymv_kernel_4x4 / ssymv_kernel_1x4 plus a scalar triangle, and the
 * left-over columns one by one through ssymv_kernel_8x1.
 * `buffer` is not used.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col, k;
	BLASLONG xi, yi;
	BLASLONG xj, yj;
	BLASLONG rows8;
	BLASLONG col_end4;
	BLASLONG first_col;
	BLASLONG ncols;
	FLOAT scaled_x;
	FLOAT dot;
	FLOAT elem;
	FLOAT *c0, *c1, *c2, *c3;
	FLOAT *colp;
	FLOAT tmp1[4];
	FLOAT tmp2[4];

#if 0
	if( m != offset )
		printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif

	first_col = m - offset;
	ncols     = m - first_col;

	/* ---- generic path: strided vectors or too few columns ---- */
	if ( (inc_x!=1) || (inc_y!=1) || (ncols<16) )
	{
		xj = first_col * inc_x;
		yj = first_col * inc_y;

		for (col = first_col; col < m; col++)
		{
			colp     = &a[col*lda];
			scaled_x = alpha * x[xj];
			dot      = 0.0;

			xi = 0;
			yi = 0;
			for (row = 0; row < col; row++)
			{
				y[yi] += scaled_x * colp[row];
				dot   += colp[row] * x[xi];
				xi += inc_x;
				yi += inc_y;
			}

			y[yj] += scaled_x * colp[col] + alpha * dot;
			xj += inc_x;
			yj += inc_y;
		}
		return(0);
	}

	/* ---- unit stride: blocks of four columns ---- */
	col_end4 = m - ( ncols % 4 );

	for (col = first_col; col < col_end4; col += 4)
	{
		for (k = 0; k < 4; k++)
			tmp1[k] = alpha * x[col+k];

		for (k = 0; k < 4; k++)
			tmp2[k] = 0.0;

		c0 = &a[col*lda];
		c1 = c0 + lda;
		c2 = c1 + lda;
		c3 = c2 + lda;

		/* rows above the 4x4 diagonal block: the part up to a multiple of
		   8 goes through the 4x4 kernel, the rest through the 1x4 kernel */
		rows8 = (col/8)*8;
		if ( rows8 )
			ssymv_kernel_4x4(rows8, c0, c1, c2, c3, x, y, tmp1, tmp2);
		if ( rows8 < col )
			ssymv_kernel_1x4(rows8, col, c0, c1, c2, c3, x, y, tmp1, tmp2);

		/* upper triangle of the 4x4 diagonal block, column by column */
		for (k = 0; k < 4; k++)
		{
			colp     = &a[(col+k)*lda];
			scaled_x = tmp1[k];
			dot      = tmp2[k];

			for (row = col; row < col+k; row++)
			{
				y[row] += scaled_x * colp[row];
				dot    += colp[row] * x[row];
			}

			y[col+k] += scaled_x * colp[col+k] + alpha * dot;
		}
	}

	/* ---- unit stride: remaining columns, continuing where the blocked loop stopped ---- */
	for ( ; col < m; col++)
	{
		scaled_x = alpha * x[col];
		dot      = 0.0;
		colp     = &a[col*lda];
		rows8    = (col/8)*8;

		if ( rows8 )
			ssymv_kernel_8x1(rows8, colp, x, y, &scaled_x, &dot);

		for (row = rows8; row < col; row++)
		{
			elem    = colp[row];
			y[row] += scaled_x * elem;
			dot    += elem * x[row];
		}

		y[col] += scaled_x * colp[col] + alpha * dot;
	}

	return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
/*
 * ssymv_kernel_4x4 (FMA4 / vfmaddps variant)
 *
 * Processes four single-precision column pointers a0..a3 against x and y,
 * four elements per loop iteration:
 *
 *   y[j]    += temp1[0]*a0[j] + temp1[1]*a1[j] + temp1[2]*a2[j] + temp1[3]*a3[j]
 *   temp2[k] = sum over j of x[j] * ak[j]          (k = 0..3)
 *
 * temp2[0..3] is overwritten, not accumulated: the accumulators are zeroed
 * at entry and stored with vmovss at exit.
 *
 * n must be a positive multiple of 4: the loop subtracts 4 from n and only
 * terminates when the result is exactly zero, and it always runs at least
 * once.
 *
 * The loop index and the remaining count are modified by the assembly, so
 * they are declared as read-write ("+r") operands. Declaring them as plain
 * inputs while writing to them is not permitted by the GCC extended-asm
 * rules and lets the compiler assume the registers are unchanged.
 */
static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vxorps		%%xmm0 , %%xmm0 , %%xmm0     \n\t"	// temp2[0]
	"vxorps		%%xmm1 , %%xmm1 , %%xmm1     \n\t"	// temp2[1]
	"vxorps		%%xmm2 , %%xmm2 , %%xmm2     \n\t"	// temp2[2]
	"vxorps		%%xmm3 , %%xmm3 , %%xmm3     \n\t"	// temp2[3]
	"vbroadcastss   (%8),    %%xmm4	             \n\t"	// temp1[0]
	"vbroadcastss  4(%8),    %%xmm5	             \n\t"	// temp1[1]
	"vbroadcastss  8(%8),    %%xmm6	             \n\t"	// temp1[2]
	"vbroadcastss 12(%8),    %%xmm7	             \n\t"	// temp1[3]

	"xorq		%0,%0                        \n\t"	// i = 0

	".align 16				     \n\t"
	".L01LOOP%=:				     \n\t"

	"vmovups	(%2,%0,4), %%xmm8            \n\t"	// 4 * x
	"vmovups	(%3,%0,4), %%xmm9            \n\t"	// 4 * y

	"vmovups	(%4,%0,4), %%xmm12           \n\t"	// 4 * a0
	"vmovups	(%5,%0,4), %%xmm13           \n\t"	// 4 * a1

	"vfmaddps	%%xmm0 , %%xmm8, %%xmm12 , %%xmm0  \n\t"	// temp2[0] += x * a0
	"vfmaddps	%%xmm9 , %%xmm4, %%xmm12 , %%xmm9  \n\t"	// y += temp1[0] * a0

	"vfmaddps	%%xmm1 , %%xmm8, %%xmm13 , %%xmm1  \n\t"	// temp2[1] += x * a1
	"vmovups	(%6,%0,4), %%xmm14           \n\t"	// 4 * a2
	"vfmaddps	%%xmm9 , %%xmm5, %%xmm13 , %%xmm9  \n\t"	// y += temp1[1] * a1

	"vfmaddps	%%xmm2 , %%xmm8, %%xmm14 , %%xmm2  \n\t"	// temp2[2] += x * a2
	"vmovups	(%7,%0,4), %%xmm15           \n\t"	// 4 * a3
	"vfmaddps	%%xmm9 , %%xmm6, %%xmm14 , %%xmm9  \n\t"	// y += temp1[2] * a2

	"vfmaddps	%%xmm3 , %%xmm8, %%xmm15 , %%xmm3  \n\t"	// temp2[3] += x * a3
	"vfmaddps	%%xmm9 , %%xmm7, %%xmm15 , %%xmm9  \n\t"	// y += temp1[3] * a3

	"vmovups	%%xmm9 ,  (%3,%0,4)	     \n\t"	// store 4 * y

	"addq		$4 , %0	  	 	     \n\t"
	"subq		$4 , %1	  	 	     \n\t"
	"jnz		.L01LOOP%=		     \n\t"

	// two horizontal adds reduce each 4-lane accumulator to its total
	"vhaddps        %%xmm0, %%xmm0, %%xmm0  \n\t"
	"vhaddps        %%xmm1, %%xmm1, %%xmm1  \n\t"
	"vhaddps        %%xmm2, %%xmm2, %%xmm2  \n\t"
	"vhaddps        %%xmm3, %%xmm3, %%xmm3  \n\t"
	"vhaddps        %%xmm0, %%xmm0, %%xmm0  \n\t"
	"vhaddps        %%xmm1, %%xmm1, %%xmm1  \n\t"
	"vhaddps        %%xmm2, %%xmm2, %%xmm2  \n\t"
	"vhaddps        %%xmm3, %%xmm3, %%xmm3  \n\t"

	"vmovss		%%xmm0 ,    (%9)	\n\t"	// save temp2[0]
	"vmovss		%%xmm1 ,   4(%9)	\n\t"	// save temp2[1]
	"vmovss		%%xmm2 ,   8(%9)	\n\t"	// save temp2[2]
	"vmovss		%%xmm3 ,  12(%9)	\n\t"	// save temp2[3]

	:
          "+r" (i),	// 0  (written by xorq/addq)
	  "+r" (n)  	// 1  (written by subq)
	:
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (a0),	// 4
          "r" (a1),	// 5
          "r" (a2),	// 6
          "r" (a3),	// 7
          "r" (temp1),  // 8
          "r" (temp2)   // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
/*
 * ssymv_kernel_4x4 (SSE variant: movups / mulps / addps / haddps)
 *
 * Processes four single-precision column pointers a0..a3 against x and y,
 * four elements per loop iteration:
 *
 *   y[j]    += temp1[0]*a0[j] + temp1[1]*a1[j] + temp1[2]*a2[j] + temp1[3]*a3[j]
 *   temp2[k] = sum over j of x[j] * ak[j]          (k = 0..3)
 *
 * temp2[0..3] is overwritten, not accumulated: the accumulators are zeroed
 * at entry and stored with movss at exit.
 *
 * n must be a positive multiple of 4: the loop subtracts 4 from n and only
 * terminates when the result is exactly zero, and it always runs at least
 * once.
 *
 * The loop index and the remaining count are modified by the assembly, so
 * they are declared as read-write ("+r") operands. Declaring them as plain
 * inputs while writing to them is not permitted by the GCC extended-asm
 * rules and lets the compiler assume the registers are unchanged.
 */
static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"xorps		%%xmm0 , %%xmm0     \n\t"	// temp2[0]
	"xorps		%%xmm1 , %%xmm1     \n\t"	// temp2[1]
	"xorps		%%xmm2 , %%xmm2     \n\t"	// temp2[2]
	"xorps		%%xmm3 , %%xmm3     \n\t"	// temp2[3]
	"movss		  (%8), %%xmm4	    \n\t"	// temp1[0]
	"movss		 4(%8), %%xmm5	    \n\t"	// temp1[1]
	"movss		 8(%8), %%xmm6	    \n\t"	// temp1[2]
	"movss		12(%8), %%xmm7	    \n\t"	// temp1[3]
	"shufps $0, %%xmm4, %%xmm4	    \n\t"	// broadcast temp1[0] to all lanes
	"shufps $0, %%xmm5, %%xmm5	    \n\t"	// broadcast temp1[1] to all lanes
	"shufps $0, %%xmm6, %%xmm6	    \n\t"	// broadcast temp1[2] to all lanes
	"shufps $0, %%xmm7, %%xmm7	    \n\t"	// broadcast temp1[3] to all lanes

	"xorq		%0,%0               \n\t"	// i = 0

	".align 16			    \n\t"
	".L01LOOP%=:			    \n\t"
	"movups	(%2,%0,4), %%xmm8           \n\t"	// 4 * x
	"movups	(%3,%0,4), %%xmm9           \n\t"	// 4 * y

	"movups	(%4,%0,4), %%xmm12          \n\t"	// 4 * a0
	"movups	(%5,%0,4), %%xmm13          \n\t"	// 4 * a1

	"movups		%%xmm12 , %%xmm11   \n\t"	// copy a0, mulps is destructive
	"mulps		%%xmm4  , %%xmm11   \n\t"	// temp1[0] * a0
	"addps		%%xmm11 , %%xmm9    \n\t"	// y += temp1[0] * a0
	"mulps		%%xmm8  , %%xmm12   \n\t"	// a0 * x
	"addps		%%xmm12 , %%xmm0    \n\t"	// temp2[0] += x * a0

	"movups	(%6,%0,4), %%xmm14          \n\t"	// 4 * a2
	"movups	(%7,%0,4), %%xmm15          \n\t"	// 4 * a3

	"movups		%%xmm13 , %%xmm11   \n\t"
	"mulps		%%xmm5  , %%xmm11   \n\t"	// temp1[1] * a1
	"addps		%%xmm11 , %%xmm9    \n\t"	// y += temp1[1] * a1
	"mulps		%%xmm8  , %%xmm13   \n\t"	// a1 * x
	"addps		%%xmm13 , %%xmm1    \n\t"	// temp2[1] += x * a1

	"movups		%%xmm14 , %%xmm11   \n\t"
	"mulps		%%xmm6  , %%xmm11   \n\t"	// temp1[2] * a2
	"addps		%%xmm11 , %%xmm9    \n\t"	// y += temp1[2] * a2
	"mulps		%%xmm8  , %%xmm14   \n\t"	// a2 * x
	"addps		%%xmm14 , %%xmm2    \n\t"	// temp2[2] += x * a2

	"movups		%%xmm15 , %%xmm11   \n\t"
	"mulps		%%xmm7  , %%xmm11   \n\t"	// temp1[3] * a3
	"addps		%%xmm11 , %%xmm9    \n\t"	// y += temp1[3] * a3
	"mulps		%%xmm8  , %%xmm15   \n\t"	// a3 * x
	"addps		%%xmm15 , %%xmm3    \n\t"	// temp2[3] += x * a3

	"movups	%%xmm9, (%3,%0,4)	    \n\t"	// store 4 * y

	"addq		$4 , %0	  	    \n\t"
	"subq		$4 , %1	  	    \n\t"
	"jnz		.L01LOOP%=	    \n\t"

	// two horizontal adds reduce each 4-lane accumulator to its total
	"haddps        %%xmm0, %%xmm0  \n\t"
	"haddps        %%xmm1, %%xmm1  \n\t"
	"haddps        %%xmm2, %%xmm2  \n\t"
	"haddps        %%xmm3, %%xmm3  \n\t"
	"haddps        %%xmm0, %%xmm0  \n\t"
	"haddps        %%xmm1, %%xmm1  \n\t"
	"haddps        %%xmm2, %%xmm2  \n\t"
	"haddps        %%xmm3, %%xmm3  \n\t"

	"movss		%%xmm0 ,   (%9)	\n\t"	// save temp2[0]
	"movss		%%xmm1 ,  4(%9)	\n\t"	// save temp2[1]
	"movss		%%xmm2 ,  8(%9)	\n\t"	// save temp2[2]
	"movss		%%xmm3 , 12(%9)	\n\t"	// save temp2[3]

	:
          "+r" (i),	// 0  (written by xorq/addq)
	  "+r" (n)  	// 1  (written by subq)
	:
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (a0),	// 4
          "r" (a1),	// 5
          "r" (a2),	// 6
          "r" (a3),	// 7
          "r" (temp1),  // 8
          "r" (temp2)   // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "zaxpy_microk_bulldozer-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4
|
||||
|
||||
/*
 * Portable C fallback for the unit-stride complex axpy kernel.
 *
 * x and y hold interleaved (real, imag) pairs; alpha[0] is the real part
 * and alpha[1] the imaginary part of the scalar. Each pass of the loop
 * updates two complex elements of y:
 *
 *   without CONJ:  y += alpha * x
 *   with    CONJ:  y += alpha * conj(x)
 *
 * n counts complex elements and is consumed two at a time, so it is
 * expected to be even (the caller in this file passes a multiple of 4).
 */
static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG done;			/* complex elements processed so far */
	BLASLONG pos = 0;		/* index into the interleaved arrays  */
	FLOAT da_r = alpha[0];
	FLOAT da_i = alpha[1];

	for ( done = 0; done < n; done += 2, pos += 4 )
	{
#if !defined(CONJ)
		/* first complex element of the pair */
		y[pos]   += ( da_r * x[pos]   - da_i * x[pos+1] ) ;
		y[pos+1] += ( da_r * x[pos+1] + da_i * x[pos]   ) ;
		/* second complex element of the pair */
		y[pos+2] += ( da_r * x[pos+2] - da_i * x[pos+3] ) ;
		y[pos+3] += ( da_r * x[pos+3] + da_i * x[pos+2] ) ;
#else
		/* first complex element of the pair, x conjugated */
		y[pos]   += ( da_r * x[pos]   + da_i * x[pos+1] ) ;
		y[pos+1] -= ( da_r * x[pos+1] - da_i * x[pos]   ) ;
		/* second complex element of the pair, x conjugated */
		y[pos+2] += ( da_r * x[pos+2] + da_i * x[pos+3] ) ;
		y[pos+3] -= ( da_r * x[pos+3] - da_i * x[pos+2] ) ;
#endif
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Complex axpy driver: y += alpha * x (or y += alpha * conj(x) when CONJ
 * is defined), with alpha = da_r + i*da_i.
 *
 * x and y hold interleaved (real, imag) pairs; n counts complex elements
 * and inc_x / inc_y are strides in complex elements. The dummy* parameters
 * are not used here.
 *
 * For unit strides the largest multiple-of-4 prefix is handed to
 * zaxpy_kernel_4 and the remaining 0..3 elements are handled by a scalar
 * loop. Any other stride uses the scalar strided loop.
 *
 * Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	FLOAT da[2];

	if ( n <= 0 )  return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{

		/* BLASLONG, not int: avoids truncating n and overflowing 2*n1 */
		BLASLONG n1 = n & -4;

		if ( n1 )
		{
			da[0] = da_r;
			da[1] = da_i;
			/* pass the array itself (FLOAT *), not its address (FLOAT (*)[2]) */
			zaxpy_kernel_4(n1, x, y , da );
			ix = 2 * n1;
		}
		i = n1;

		/* scalar tail: the 0..3 complex elements the kernel did not cover */
		while(i < n)
		{
#if !defined(CONJ)
			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
			i++ ;
			ix += 2;

		}
		return(0);

	}

	/* convert strides from complex elements to FLOAT elements */
	inc_x *=2;
	inc_y *=2;

	while(i < n)
	{

#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;

	}
	return(0);

}
|
||||
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4 1
|
||||
/*
 * zaxpy_kernel_4 (FMA4 / vfmaddpd variant)
 *
 * Unit-stride double-complex axpy on interleaved (real, imag) data:
 *
 *   without CONJ:  y += alpha * x
 *   with    CONJ:  y += alpha * conj(x)
 *
 * alpha[0] is the real part and alpha[1] the imaginary part of the scalar.
 * Each loop iteration handles 4 complex elements (8 doubles, one complex
 * value per xmm register).
 *
 * n counts complex elements and must be a positive multiple of 4: the loop
 * subtracts 4 from n and only terminates when the result is exactly zero,
 * and it always runs at least once.
 *
 * The loop index and the remaining count are modified by the assembly, so
 * they are declared as read-write ("+r") operands. Declaring them as plain
 * inputs while writing to them is not permitted by the GCC extended-asm
 * rules and lets the compiler assume the registers are unchanged. The index
 * is not zeroed inside the asm, so it relies on the C initialiser below.
 */
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vmovddup		(%4), %%xmm0		    \n\t"	// real part of alpha
	"vmovddup	       8(%4), %%xmm1		    \n\t"	// imag part of alpha

	".align 16				            \n\t"
	".L01LOOP%=:				            \n\t"

	"prefetcht0	768(%2,%0,8)		            \n\t"
	"vmovups                  (%2,%0,8), %%xmm5         \n\t"	// 1 complex value from x
	"vmovups                16(%2,%0,8), %%xmm7         \n\t"	// 1 complex value from x
	"vmovups                32(%2,%0,8), %%xmm9         \n\t"	// 1 complex value from x
	"vmovups                48(%2,%0,8), %%xmm11        \n\t"	// 1 complex value from x
	"prefetcht0	768(%3,%0,8)		            \n\t"

#if !defined(CONJ)
	"vfmaddpd    (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12    \n\t"	// y + a_r * x
	"vpermilpd	$0x1 , %%xmm5 , %%xmm4 		    \n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm4 , %%xmm4		    \n\t"	// a_i*x_i, a_i*x_r

	"vfmaddpd  16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13    \n\t"	// y + a_r * x
	"vpermilpd	$0x1 , %%xmm7 , %%xmm6 		    \n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm6 , %%xmm6		    \n\t"	// a_i*x_i, a_i*x_r

	"vfmaddpd  32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14    \n\t"	// y + a_r * x
	"vpermilpd	$0x1 , %%xmm9 , %%xmm8 		    \n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm8 , %%xmm8		    \n\t"	// a_i*x_i, a_i*x_r

	"vfmaddpd  48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15    \n\t"	// y + a_r * x
	"vpermilpd	$0x1 , %%xmm11, %%xmm10		    \n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm10, %%xmm10	    \n\t"	// a_i*x_i, a_i*x_r

	// real lane: subtract a_i*x_i ; imag lane: add a_i*x_r
	"vaddsubpd	%%xmm4, %%xmm12, %%xmm12	    \n\t"
	"vaddsubpd	%%xmm6, %%xmm13, %%xmm13	    \n\t"
	"vaddsubpd	%%xmm8, %%xmm14, %%xmm14	    \n\t"
	"vaddsubpd	%%xmm10,%%xmm15, %%xmm15	    \n\t"

#else

	"vmulpd		%%xmm0, %%xmm5, %%xmm4		    \n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm5, %%xmm5		    \n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm7, %%xmm6		    \n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm7, %%xmm7		    \n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm9, %%xmm8		    \n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm9, %%xmm9		    \n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm11, %%xmm10	    \n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm11, %%xmm11	    \n\t"	// a_i*x_r, a_i*x_i

	"vpermilpd	$0x1 , %%xmm4 , %%xmm4 		    \n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm4 ,%%xmm5 , %%xmm4		    \n\t"
	"vpermilpd	$0x1 , %%xmm4 , %%xmm4 		    \n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm6 , %%xmm6 		    \n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm6 ,%%xmm7 , %%xmm6		    \n\t"
	"vpermilpd	$0x1 , %%xmm6 , %%xmm6 		    \n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm8 , %%xmm8 		    \n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm8 ,%%xmm9 , %%xmm8		    \n\t"
	"vpermilpd	$0x1 , %%xmm8 , %%xmm8 		    \n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm10, %%xmm10		    \n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm10,%%xmm11, %%xmm10	    \n\t"
	"vpermilpd	$0x1 , %%xmm10, %%xmm10		    \n\t"	// exchange real and imag part

	"vaddpd		  (%3,%0,8) ,%%xmm4 , %%xmm12	    \n\t"	// add y
	"vaddpd		16(%3,%0,8) ,%%xmm6 , %%xmm13	    \n\t"	// add y
	"vaddpd		32(%3,%0,8) ,%%xmm8 , %%xmm14	    \n\t"	// add y
	"vaddpd		48(%3,%0,8) ,%%xmm10, %%xmm15	    \n\t"	// add y

#endif

	"vmovups	%%xmm12,   (%3,%0,8)		    \n\t"	// store y
	"vmovups	%%xmm13, 16(%3,%0,8)		    \n\t"
	"vmovups	%%xmm14, 32(%3,%0,8)		    \n\t"
	"vmovups	%%xmm15, 48(%3,%0,8)		    \n\t"

	"addq		$8 , %0	  	 	            \n\t"	// 8 doubles = 4 complex values
	"subq	        $4 , %1		                    \n\t"	// 4 complex values done
	"jnz		.L01LOOP%=		            \n\t"

	:
          "+r" (i),	// 0  (written by addq)
	  "+r" (n)  	// 1  (written by subq)
	:
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (alpha)   // 4
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
|
||||
|
||||
|
|
@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 )
|
||||
VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $8*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 8*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
|
||||
VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $4*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 4*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
|
||||
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
|
||||
|
||||
addq $6*SIZE, BO
|
||||
addq $2*SIZE, AO
|
||||
addq $ 6*SIZE, BO
|
||||
addq $ 2*SIZE, AO
|
||||
decq %rax
|
||||
.endm
|
||||
|
||||
|
|
Loading…
Reference in New Issue