Merge pull request #440 from wernsaar/develop

optimizations for level1 and level2 blas functions
This commit is contained in:
Zhang Xianyi 2014-08-28 12:43:54 +08:00
commit 2702323f7d
42 changed files with 4868 additions and 31 deletions

View File

@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@ -350,6 +350,7 @@ ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -wd981
endif
ifeq ($(USE_OPENMP), 1)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))

View File

@ -35,7 +35,10 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto \
ssymv.goto dsymv.goto \
sdot.goto ddot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
chemv.goto zhemv.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
@ -53,7 +56,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml \
ssymv.acml dsymv.acml \
sdot.acml ddot.acml \
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
ssymv.acml dsymv.acml csymv.acml zsymv.acml \
chemv.acml zhemv.acml \
chemm.acml zhemm.acml \
cherk.acml zherk.acml \
cher2k.acml zher2k.acml \
@ -71,7 +77,10 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
sger.atlas dger.atlas \
ssymv.atlas dsymv.atlas \
sdot.atlas ddot.atlas \
saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
chemv.atlas zhemv.atlas \
chemm.acml zhemm.acml \
chemm.atlas zhemm.atlas \
cherk.atlas zherk.atlas \
@ -90,7 +99,10 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl \
ssymv.mkl dsymv.mkl \
sdot.mkl ddot.mkl \
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
chemv.mkl zhemv.mkl \
chemm.mkl zhemm.mkl \
cherk.mkl zherk.mkl \
cher2k.mkl zher2k.mkl \
@ -100,7 +112,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \
ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
all :: goto atlas acml mkl
all :: goto mkl atlas acml
##################################### Slinpack ####################################################
slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)
@ -732,6 +744,32 @@ dsymv.atlas : dsymv.$(SUFFIX)
dsymv.mkl : dsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Csymv ####################################################
csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
csymv.acml : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
csymv.atlas : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
csymv.mkl : csymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zsymv ####################################################
zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zsymv.acml : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zsymv.atlas : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zsymv.mkl : zsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgeev ####################################################
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
@ -896,6 +934,115 @@ zpotrf.atlas : zpotrf.$(SUFFIX)
zpotrf.mkl : zpotrf.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Chemv ####################################################
chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
chemv.acml : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
chemv.atlas : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
chemv.mkl : chemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zhemv ####################################################
zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zhemv.acml : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zhemv.atlas : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zhemv.mkl : zhemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sdot ####################################################
# Sdot benchmark executables, one per BLAS backend.
# sdot.goto links against ../$(LIBNAME) and must succeed.  The acml/atlas/mkl
# links carry the `-` prefix, like the other benchmark rules in this file, so
# that a backend library that is not installed does not abort the whole build
# (presumably those libraries are optional -- confirm against LIBACML/LIBATLAS/LIBMKL).
sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm

sdot.acml : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sdot.atlas : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sdot.mkl : sdot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ddot ####################################################
# Ddot benchmark executables, one per BLAS backend.
# ddot.goto links against ../$(LIBNAME) and must succeed.  The acml/atlas/mkl
# links carry the `-` prefix, like the other benchmark rules in this file, so
# that a backend library that is not installed does not abort the whole build
# (presumably those libraries are optional -- confirm against LIBACML/LIBATLAS/LIBMKL).
ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm

ddot.acml : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ddot.atlas : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ddot.mkl : ddot.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
saxpy.acml : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
saxpy.atlas : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
saxpy.mkl : saxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Daxpy ####################################################
daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
daxpy.acml : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
daxpy.atlas : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
daxpy.mkl : daxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Caxpy ####################################################
caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
caxpy.acml : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
caxpy.atlas : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
caxpy.mkl : caxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zaxpy ####################################################
zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zaxpy.acml : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zaxpy.atlas : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zaxpy.mkl : zaxpy.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
###################################################################################################
@ -1037,6 +1184,12 @@ ssymv.$(SUFFIX) : symv.c
dsymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
csymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zsymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sgeev.$(SUFFIX) : geev.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@ -1073,7 +1226,29 @@ cpotrf.$(SUFFIX) : potrf.c
zpotrf.$(SUFFIX) : potrf.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
chemv.$(SUFFIX) : hemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zhemv.$(SUFFIX) : hemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
ddot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
saxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
daxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
caxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zaxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

201
benchmark/axpy.c Normal file
View File

@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPY BLASFUNC(zaxpy)
#else
#define AXPY BLASFUNC(caxpy)
#endif
#else
#ifdef DOUBLE
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for native Windows builds.
 *
 * Reads the current system time with GetSystemTimeAsFileTime(), combines the
 * high and low 32-bit halves into one 64-bit value, converts it to
 * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS to move to the Unix epoch
 * and splits the result into seconds and microseconds.
 *
 *   tv : receives the time; nothing is written when it is NULL.
 *   tz : ignored, present only to keep the usual two-argument signature.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

	FILETIME ft;
	unsigned __int64 tmpres = 0;

	(void)tz;	/* time-zone information is not provided by this shim */

	if (NULL != tv)
	{
		GetSystemTimeAsFileTime(&ft);

		/* assemble the 64-bit tick count from its two 32-bit halves */
		tmpres |= ft.dwHighDateTime;
		tmpres <<= 32;
		tmpres |= ft.dwLowDateTime;

		/*converting file time to unix epoch*/
		tmpres /= 10; /*convert into microseconds*/
		tmpres -= DELTA_EPOCH_IN_MICROSECS;
		tv->tv_sec = (long)(tmpres / 1000000UL);
		tv->tv_usec = (long)(tmpres % 1000000UL);
	}

	return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes from a private System V shared-memory segment that is
 * requested with SHM_HUGETLB.  The requested length is
 * (size + HUGE_PAGESIZE) masked down to a multiple of HUGE_PAGESIZE.
 *
 * The segment is attached, then immediately marked for removal with IPC_RMID.
 * On any failure a message is printed to stdout and the process exits with
 * status 1, so the function only returns a usable address.
 */
static void *huge_malloc(BLASLONG size){

	int segment;
	void *mapping;

	segment = shmget(IPC_PRIVATE,
			 (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			 SHM_HUGETLB | IPC_CREAT |0600);

	if (segment < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	mapping = shmat(segment, NULL, SHM_RND);

	/* shmat reports failure as (void *)-1 */
	if ((BLASLONG)mapping == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	shmctl(segment, IPC_RMID, 0);

	return mapping;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the AXPY routine selected by the COMPLEX / DOUBLE
 * macros (see the AXPY definition above).
 *
 * Command line : [from [to [step]]]   vector lengths to sweep (defaults 1, 200, 1)
 * Environment  : OPENBLAS_LOOPS       repetitions per length (default 1)
 *                OPENBLAS_INCX/INCY   vector strides (default 1)
 *
 * For every length the vectors are refilled with random values in
 * [-0.5, 0.5], a single AXPY call is timed with gettimeofday(), and the mean
 * rate over all repetitions is written to stderr in MFlops.
 *
 * Returns 0; exits with status 1 when a vector cannot be allocated.
 */
int MAIN__(int argc, char *argv[]){

	FLOAT *x, *y;
	FLOAT alpha[2] = { 2.0, 2.0 };
	blasint m, i;
	blasint inc_x=1,inc_y=1;
	blasint len_x, len_y;		/* storage elements reserved per logical entry */
	int loops = 1;
	int l;
	char *p;

	int from =   1;
	int to   = 200;
	int step =   1;

	struct timeval start, stop;
	double time1,timeg;

	argc--;argv++;

	if (argc > 0) { from = atol(*argv); argc--; argv++;}
	if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
	if (argc > 0) { step = atol(*argv); argc--; argv++;}

	if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
	if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
	if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

	/* A step below 1 would never advance the size loop, and a repetition
	   count below 1 would make the averaging below divide by zero. */
	if (step < 1) step = 1;
	if (loops < 1) loops = 1;

	/* With a stride of 0 the routine still reads element 0, so reserve and
	   initialise at least one element per entry instead of allocating 0 bytes. */
	len_x = MAX(abs(inc_x), 1);
	len_y = MAX(abs(inc_y), 1);

	/* blasint is cast for printing, as the size column below already does */
	fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

	if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * len_x * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

	if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * len_y * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

#ifdef linux
	srandom(getpid());
#endif

	fprintf(stderr, " SIZE Flops\n");

	for(m = from; m <= to; m += step)
	{

		timeg=0;

		fprintf(stderr, " %6d : ", (int)m);

		for (l=0; l<loops; l++)
		{

			/* fresh operands for every repetition; only the call itself is timed */
			for(i = 0; i < m * COMPSIZE * len_x; i++){
				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			for(i = 0; i < m * COMPSIZE * len_y; i++){
				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			gettimeofday( &start, (struct timezone *)0);

			AXPY (&m, alpha, x, &inc_x, y, &inc_y );

			gettimeofday( &stop, (struct timezone *)0);

			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

			timeg += time1;

		}

		timeg /= loops;

		fprintf(stderr,
			" %10.2f MFlops\n",
			COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);

	}

	return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/dot.c Normal file
View File

@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for native Windows builds.
 *
 * Reads the current system time with GetSystemTimeAsFileTime(), combines the
 * high and low 32-bit halves into one 64-bit value, converts it to
 * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS to move to the Unix epoch
 * and splits the result into seconds and microseconds.
 *
 *   tv : receives the time; nothing is written when it is NULL.
 *   tz : ignored, present only to keep the usual two-argument signature.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

	FILETIME ft;
	unsigned __int64 tmpres = 0;

	(void)tz;	/* time-zone information is not provided by this shim */

	if (NULL != tv)
	{
		GetSystemTimeAsFileTime(&ft);

		/* assemble the 64-bit tick count from its two 32-bit halves */
		tmpres |= ft.dwHighDateTime;
		tmpres <<= 32;
		tmpres |= ft.dwLowDateTime;

		/*converting file time to unix epoch*/
		tmpres /= 10; /*convert into microseconds*/
		tmpres -= DELTA_EPOCH_IN_MICROSECS;
		tv->tv_sec = (long)(tmpres / 1000000UL);
		tv->tv_usec = (long)(tmpres % 1000000UL);
	}

	return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes from a private System V shared-memory segment that is
 * requested with SHM_HUGETLB.  The requested length is
 * (size + HUGE_PAGESIZE) masked down to a multiple of HUGE_PAGESIZE.
 *
 * The segment is attached, then immediately marked for removal with IPC_RMID.
 * On any failure a message is printed to stdout and the process exits with
 * status 1, so the function only returns a usable address.
 */
static void *huge_malloc(BLASLONG size){

	int segment;
	void *mapping;

	segment = shmget(IPC_PRIVATE,
			 (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			 SHM_HUGETLB | IPC_CREAT |0600);

	if (segment < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	mapping = shmat(segment, NULL, SHM_RND);

	/* shmat reports failure as (void *)-1 */
	if ((BLASLONG)mapping == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	shmctl(segment, IPC_RMID, 0);

	return mapping;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the DOT routine selected by the DOUBLE macro
 * (see the DOT definition above).
 *
 * Command line : [from [to [step]]]   vector lengths to sweep (defaults 1, 200, 1)
 * Environment  : OPENBLAS_LOOPS       repetitions per length (default 1)
 *                OPENBLAS_INCX/INCY   vector strides (default 1)
 *
 * For every length the vectors are refilled with random values in
 * [-0.5, 0.5], a single DOT call is timed with gettimeofday(), and the mean
 * rate over all repetitions is written to stderr in MFlops.
 *
 * Returns 0; exits with status 1 when a vector cannot be allocated.
 */
int MAIN__(int argc, char *argv[]){

	FLOAT *x, *y;
	FLOAT result;
	blasint m, i;
	blasint inc_x=1,inc_y=1;
	blasint len_x, len_y;		/* storage elements reserved per logical entry */
	int loops = 1;
	int l;
	char *p;

	int from =   1;
	int to   = 200;
	int step =   1;

	struct timeval start, stop;
	double time1,timeg;

	argc--;argv++;

	if (argc > 0) { from = atol(*argv); argc--; argv++;}
	if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
	if (argc > 0) { step = atol(*argv); argc--; argv++;}

	if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
	if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
	if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

	/* A step below 1 would never advance the size loop, and a repetition
	   count below 1 would make the averaging below divide by zero. */
	if (step < 1) step = 1;
	if (loops < 1) loops = 1;

	/* With a stride of 0 the routine still reads element 0, so reserve and
	   initialise at least one element per entry instead of allocating 0 bytes. */
	len_x = MAX(abs(inc_x), 1);
	len_y = MAX(abs(inc_y), 1);

	/* blasint is cast for printing, as the size column below already does */
	fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

	if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * len_x * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

	if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * len_y * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

#ifdef linux
	srandom(getpid());
#endif

	fprintf(stderr, " SIZE Flops\n");

	for(m = from; m <= to; m += step)
	{

		timeg=0;

		fprintf(stderr, " %6d : ", (int)m);

		for (l=0; l<loops; l++)
		{

			/* fresh operands for every repetition; only the call itself is timed */
			for(i = 0; i < m * COMPSIZE * len_x; i++){
				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			for(i = 0; i < m * COMPSIZE * len_y; i++){
				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			gettimeofday( &start, (struct timezone *)0);

			result = DOT (&m, x, &inc_x, y, &inc_y );

			gettimeofday( &stop, (struct timezone *)0);

			(void)result;	/* the value itself is not reported, only the timing */

			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

			timeg += time1;

		}

		timeg /= loops;

		fprintf(stderr,
			" %10.2f MFlops\n",
			COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);

	}

	return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);

208
benchmark/hemv.c Normal file
View File

@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMV
#ifdef DOUBLE
#define HEMV BLASFUNC(zhemv)
#else
#define HEMV BLASFUNC(chemv)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for native Windows builds.
 *
 * Reads the current system time with GetSystemTimeAsFileTime(), combines the
 * high and low 32-bit halves into one 64-bit value, converts it to
 * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS to move to the Unix epoch
 * and splits the result into seconds and microseconds.
 *
 *   tv : receives the time; nothing is written when it is NULL.
 *   tz : ignored, present only to keep the usual two-argument signature.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

	FILETIME ft;
	unsigned __int64 tmpres = 0;

	(void)tz;	/* time-zone information is not provided by this shim */

	if (NULL != tv)
	{
		GetSystemTimeAsFileTime(&ft);

		/* assemble the 64-bit tick count from its two 32-bit halves */
		tmpres |= ft.dwHighDateTime;
		tmpres <<= 32;
		tmpres |= ft.dwLowDateTime;

		/*converting file time to unix epoch*/
		tmpres /= 10; /*convert into microseconds*/
		tmpres -= DELTA_EPOCH_IN_MICROSECS;
		tv->tv_sec = (long)(tmpres / 1000000UL);
		tv->tv_usec = (long)(tmpres % 1000000UL);
	}

	return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes from a private System V shared-memory segment that is
 * requested with SHM_HUGETLB.  The requested length is
 * (size + HUGE_PAGESIZE) masked down to a multiple of HUGE_PAGESIZE.
 *
 * The segment is attached, then immediately marked for removal with IPC_RMID.
 * On any failure a message is printed to stdout and the process exits with
 * status 1, so the function only returns a usable address.
 */
static void *huge_malloc(BLASLONG size){

	int segment;
	void *mapping;

	segment = shmget(IPC_PRIVATE,
			 (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			 SHM_HUGETLB | IPC_CREAT |0600);

	if (segment < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	mapping = shmat(segment, NULL, SHM_RND);

	/* shmat reports failure as (void *)-1 */
	if ((BLASLONG)mapping == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	shmctl(segment, IPC_RMID, 0);

	return mapping;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the HEMV routine selected by the DOUBLE macro
 * (see the HEMV definition above).
 *
 * Command line : [from [to [step]]]   matrix orders to sweep (defaults 1, 200, 1)
 * Environment  : OPENBLAS_LOOPS       repetitions per order (default 1)
 *                OPENBLAS_INCX/INCY   vector strides (default 1)
 *                OPENBLAS_UPLO        first character is passed as uplo (default 'L')
 *
 * For every order m an m-by-m matrix is filled once with random values; for
 * each repetition the vectors are refilled, one HEMV call (leading dimension
 * m) is timed with gettimeofday(), and the mean rate over all repetitions is
 * written to stderr in MFlops.
 *
 * Returns 0; exits with status 1 when a buffer cannot be allocated.
 */
int MAIN__(int argc, char *argv[]){

	FLOAT *a, *x, *y;
	FLOAT alpha[] = {1.0, 1.0};
	FLOAT beta [] = {1.0, 1.0};
	char uplo='L';
	blasint m, i, j;
	blasint inc_x=1,inc_y=1;
	blasint len_x, len_y;		/* storage elements reserved per logical entry */
	int loops = 1;
	int l;
	char *p;

	int from =   1;
	int to   = 200;
	int step =   1;

	struct timeval start, stop;
	double time1,timeg;

	argc--;argv++;

	if (argc > 0) { from = atol(*argv); argc--; argv++;}
	if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
	if (argc > 0) { step = atol(*argv); argc--; argv++;}

	if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
	if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
	if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
	if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;

	/* A step below 1 would never advance the size loop, and a repetition
	   count below 1 would make the averaging below divide by zero. */
	if (step < 1) step = 1;
	if (loops < 1) loops = 1;

	/* With a stride of 0 the routine still reads element 0, so reserve and
	   initialise at least one element per entry instead of allocating 0 bytes. */
	len_x = MAX(abs(inc_x), 1);
	len_y = MAX(abs(inc_y), 1);

	/* blasint is cast for printing, as the size column below already does */
	fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,(int)inc_x,(int)inc_y,loops);

	if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

	if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * len_x * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

	if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * len_y * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

#ifdef linux
	srandom(getpid());
#endif

	fprintf(stderr, " SIZE Flops\n");

	for(m = from; m <= to; m += step)
	{

		timeg=0;

		fprintf(stderr, " %6dx%d : ", (int)m,(int)m);

		/* the matrix is generated once per order and reused by every repetition */
		for(j = 0; j < m; j++){
			for(i = 0; i < m * COMPSIZE; i++){
				a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}
		}

		for (l=0; l<loops; l++)
		{

			/* fresh vectors for every repetition; only the call itself is timed */
			for(i = 0; i < m * COMPSIZE * len_x; i++){
				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			for(i = 0; i < m * COMPSIZE * len_y; i++){
				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
			}

			gettimeofday( &start, (struct timezone *)0);

			HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );

			gettimeofday( &stop, (struct timezone *)0);

			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

			timeg += time1;

		}

		timeg /= loops;

		fprintf(stderr,
			" %10.2f MFlops\n",
			COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);

	}

	return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

70
kernel/arm/symv_L.c Normal file
View File

@ -0,0 +1,70 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Plain C symv kernel working on the entries a[j*lda + i] with i >= j.
 *
 * For each j in [0, offset):
 *   - y[j] receives alpha * x[j] * a[j*lda + j];
 *   - every y[i], j < i < m, receives alpha * x[j] * a[j*lda + i];
 *   - y[j] additionally receives alpha * sum over i > j of a[j*lda + i] * x[i].
 * x and y are walked with strides inc_x and inc_y.  `buffer` is not used.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG col, row;
	BLASLONG base;			/* index of a[col*lda] */
	BLASLONG xdiag = 0, ydiag = 0;	/* positions of x[col] and y[col] */
	BLASLONG xpos, ypos;		/* positions of x[row] and y[row] */
	FLOAT scaled_x;
	FLOAT dot;

#if 0
	if ( m != offset )
		printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif

	for (col = 0; col < offset; col++)
	{
		base     = col * lda;
		scaled_x = alpha * x[xdiag];
		dot      = 0.0;

		y[ydiag] += scaled_x * a[base + col];

		xpos = xdiag + inc_x;
		ypos = ydiag + inc_y;
		for (row = col + 1; row < m; row++)
		{
			y[ypos] += scaled_x * a[base + row];
			dot     += a[base + row] * x[xpos];
			xpos += inc_x;
			ypos += inc_y;
		}

		y[ydiag] += alpha * dot;

		xdiag += inc_x;
		ydiag += inc_y;
	}

	return(0);
}

71
kernel/arm/symv_U.c Normal file
View File

@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Plain C symv kernel working on the entries a[j*lda + i] with i <= j.
 *
 * For each j in [m - offset, m):
 *   - every y[i], 0 <= i < j, receives alpha * x[j] * a[j*lda + i];
 *   - y[j] receives alpha * x[j] * a[j*lda + j]
 *     plus alpha * sum over i < j of a[j*lda + i] * x[i].
 * x and y are walked with strides inc_x and inc_y.  `buffer` is not used.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG col, row;
	BLASLONG base;		/* index of a[col*lda] */
	BLASLONG xdiag, ydiag;	/* positions of x[col] and y[col] */
	BLASLONG xpos, ypos;	/* positions of x[row] and y[row] */
	FLOAT scaled_x;
	FLOAT dot;

#if 0
	if( m != offset )
		printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif

	BLASLONG first = m - offset;	/* first index handled by this call */

	xdiag = first * inc_x;
	ydiag = first * inc_y;

	for (col = first; col < m; col++)
	{
		base     = col * lda;
		scaled_x = alpha * x[xdiag];
		dot      = 0.0;

		xpos = 0;
		ypos = 0;
		for (row = 0; row < col; row++)
		{
			y[ypos] += scaled_x * a[base + row];
			dot     += a[base + row] * x[xpos];
			xpos += inc_x;
			ypos += inc_y;
		}

		y[ydiag] += scaled_x * a[base + col] + alpha * dot;

		xdiag += inc_x;
		ydiag += inc_y;
	}

	return(0);
}

View File

@ -1,3 +1,15 @@
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c

View File

@ -1,5 +1,17 @@
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
DGEMVNKERNEL = dgemv_n.c
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S

131
kernel/x86_64/caxpy.c Normal file
View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable fallback kernel for unit-stride complex axpy.
 *
 *   n     : number of complex elements to process; the loop consumes two
 *           complex elements per pass and stops once at least n are done
 *   x, y  : interleaved (real, imag) arrays, y is updated in place
 *   alpha : alpha[0] is the real part, alpha[1] the imaginary part
 *
 * Without CONJ:  y += alpha * x
 * With    CONJ:  re(y) += ar*xr + ai*xi ,  im(y) -= ar*xi - ai*xr
 */
static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT ar = alpha[0];
	const FLOAT ai = alpha[1];
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;

	/* two complex values (= four FLOATs) per pass */
	for (done = 0; done < n; done += 2, xp += 4, yp += 4)
	{
#if !defined(CONJ)
		yp[0] += ( ar * xp[0] - ai * xp[1] ) ;
		yp[1] += ( ar * xp[1] + ai * xp[0] ) ;
		yp[2] += ( ar * xp[2] - ai * xp[3] ) ;
		yp[3] += ( ar * xp[3] + ai * xp[2] ) ;
#else
		yp[0] += ( ar * xp[0] + ai * xp[1] ) ;
		yp[1] -= ( ar * xp[1] - ai * xp[0] ) ;
		yp[2] += ( ar * xp[2] + ai * xp[3] ) ;
		yp[3] -= ( ar * xp[3] - ai * xp[2] ) ;
#endif
	}
}
#endif
/*
 * Complex axpy driver.
 *
 *   Without CONJ:  y := y + (da_r + i*da_i) * x
 *   With    CONJ:  re(y) += da_r*xr + da_i*xi ,  im(y) -= da_r*xi - da_i*xr
 *                  (i.e. alpha times the conjugate of x)
 *
 *   n            : number of complex elements; n <= 0 is a no-op
 *   da_r, da_i   : real and imaginary part of alpha
 *   x, inc_x     : source vector and its stride, counted in complex elements
 *   y, inc_y     : destination vector (updated in place) and its stride
 *   dummy*       : unused, present only to match the common kernel signature
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	FLOAT da[2];

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* Largest multiple of 8 not exceeding n. Kept in BLASLONG so that
		 * neither n1 nor 2 * n1 is truncated to int for very long vectors. */
		BLASLONG n1 = n & -8;

		if ( n1 )
		{
			da[0] = da_r;
			da[1] = da_i;
			/* pass the array itself (decays to FLOAT *), not its address */
			caxpy_kernel_8(n1, x, y , da );
			ix = 2 * n1;
		}

		/* scalar tail for the remaining n - n1 elements */
		i = n1;
		while(i < n)
		{
#if !defined(CONJ)
			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
			i++ ;
			ix += 2;
		}
		return(0);
	}

	/* general strides: convert from complex elements to FLOAT elements */
	inc_x *=2;
	inc_y *=2;

	while(i < n)
	{
#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(0);
}

View File

@ -0,0 +1,135 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * Inline-assembly micro kernel for unit-stride complex axpy, using the
 * four-operand vfmaddps instruction.
 *
 *   n     : number of complex elements. The loop body runs before the
 *           counter is tested and only exits when the counter hits exactly
 *           zero after "subq $8", so n must be a positive multiple of 8.
 *   x, y  : interleaved (real, imag) arrays; addressing uses a scale of 4,
 *           i.e. the code assumes 4-byte FLOAT elements. y is updated in place.
 *   alpha : alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Each pass handles 16 FLOATs (8 complex values) of x and y.
 */
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
	"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 768(%2,%0,4) \n\t"
	"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
	"vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
	"vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
	"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
	"prefetcht0 768(%3,%0,4) \n\t"
#if !defined(CONJ)
	"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" // y + a_r * x
	"vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
	"vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" // a_i * swapped x
	"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t"
	"vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
	"vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t"
	"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t"
	"vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
	"vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t"
	"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t"
	"vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
	"vmulps %%xmm1, %%xmm10, %%xmm10 \n\t"
	"vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" // real: subtract, imag: add
	"vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t"
	"vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t"
	"vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t"
#else
	"vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
	"vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
	"vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
	"vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
	"vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
	"vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
	"vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
	"vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
	"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
	"vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
	"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
	"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
	"vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
	"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
	"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
	"vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
	"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
	"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
	"vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t"
	"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
	"vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t"
	"vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t"
	"vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t"
	"vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t"
#endif
	"vmovups %%xmm12, (%3,%0,4) \n\t"
	"vmovups %%xmm13, 16(%3,%0,4) \n\t"
	"vmovups %%xmm14, 32(%3,%0,4) \n\t"
	"vmovups %%xmm15, 48(%3,%0,4) \n\t"
	"addq $16, %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha)	// 4
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
addq $6*SIZE, BO
addq $16*SIZE, AO
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
.endm
@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
addq $6*SIZE, BO
addq $8*SIZE, AO
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $6*SIZE, BO
addq $4*SIZE, AO
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $6*SIZE, BO
addq $2*SIZE, AO
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm

105
kernel/x86_64/daxpy.c Normal file
View File

@ -0,0 +1,105 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER)
#include "daxpy_microk_bulldozer-2.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable fallback kernel for unit-stride axpy: y[k] += (*alpha) * x[k].
 *
 * Elements are handled in groups of eight; a group is started whenever its
 * first index is below n, so the caller is expected to pass a multiple of 8.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT scale = *alpha;
	BLASLONG base;
	BLASLONG k;

	for (base = 0; base < n; base += 8)
	{
		/* always a full group of eight, in ascending index order */
		for (k = 0; k < 8; k++)
			y[base + k] += scale * x[base + k];
	}
}
#endif
/*
 * Real axpy driver: y := y + da * x.
 *
 *   n         : number of elements; n <= 0 is a no-op
 *   da        : scalar multiplier
 *   x, inc_x  : source vector and its element stride
 *   y, inc_y  : destination vector (updated in place) and its element stride
 *   dummy*    : unused, present only to match the common kernel signature
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* Largest multiple of 8 not exceeding n. Kept in BLASLONG so the
		 * count is not truncated to int for very long vectors. */
		BLASLONG n1 = n & -8;

		if ( n1 )
			daxpy_kernel_8(n1, x, y , &da );

		/* scalar tail for the remaining n - n1 elements */
		i = n1;
		while(i < n)
		{
			y[i] += da * x[i] ;
			i++ ;
		}
		return(0);
	}

	/* general strides */
	while(i < n)
	{
		y[iy] += da * x[ix] ;
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(0);
}

View File

@ -0,0 +1,82 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * Inline-assembly micro kernel for unit-stride axpy, y += (*alpha) * x,
 * using the four-operand vfmaddpd instruction.
 *
 *   n     : element count. The loop body runs before the counter is tested
 *           and only exits when the counter hits exactly zero after
 *           "subq $8", so n must be a positive multiple of 8.
 *   x, y  : addressing uses a scale of 8, i.e. the code assumes 8-byte
 *           FLOAT elements. y is updated in place.
 *   alpha : pointer to the scalar multiplier.
 */
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vmovddup (%4), %%xmm0 \n\t" // alpha
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 768(%3,%0,8) \n\t"
	"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
	"vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x
	"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
	".align 2 \n\t"
	"vmovups %%xmm8 , (%3,%0,8) \n\t"
	"vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x
	".align 2 \n\t"
	"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
	"vmovups %%xmm9 , 16(%3,%0,8) \n\t"
	"prefetcht0 768(%2,%0,8) \n\t"
	".align 2 \n\t"
	"vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x
	"vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
	"vmovups %%xmm10, 32(%3,%0,8) \n\t"
	"vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x
	"vmovups %%xmm11, 48(%3,%0,8) \n\t"
	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha)	// 4
	: "cc",
	  "%xmm0",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -0,0 +1,91 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * Inline-assembly (SSE2) micro kernel for unit-stride axpy,
 * y += (*alpha) * x.
 *
 *   n     : element count. The loop body runs before the counter is tested
 *           and only exits when the counter hits exactly zero after
 *           "subq $8", so n must be a positive multiple of 8.
 *   x, y  : addressing uses a scale of 8, i.e. the code assumes 8-byte
 *           FLOAT elements. y is updated in place.
 *   alpha : pointer to the scalar multiplier.
 */
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd (%4), %%xmm0 \n\t" // alpha
	"shufpd $0, %%xmm0, %%xmm0 \n\t" // duplicate alpha into both lanes
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	// "prefetcht0 192(%2,%0,8) \n\t"
	// "prefetcht0 192(%3,%0,8) \n\t"
	"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
	"movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
	"movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
	"movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
	"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
	"movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
	"movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
	"movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
	"mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x
	"mulpd %%xmm0 , %%xmm13 \n\t"
	"mulpd %%xmm0 , %%xmm14 \n\t"
	"mulpd %%xmm0 , %%xmm15 \n\t"
	"addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x
	"addpd %%xmm13, %%xmm9 \n\t"
	"addpd %%xmm14, %%xmm10 \n\t"
	"addpd %%xmm15, %%xmm11 \n\t"
	"movups %%xmm8 , (%3,%0,8) \n\t"
	"movups %%xmm9 , 16(%3,%0,8) \n\t"
	"movups %%xmm10, 32(%3,%0,8) \n\t"
	"movups %%xmm11, 48(%3,%0,8) \n\t"
	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha)	// 4
	: "cc",
	  "%xmm0",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

110
kernel/x86_64/ddot.c Normal file
View File

@ -0,0 +1,110 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable fallback kernel for the unit-stride dot product.
 *
 * Walks x and y in groups of eight elements (a group is started whenever
 * elements remain, so the caller is expected to pass a multiple of 8),
 * sums the eight products of each group, and finally adds the accumulated
 * total to *d.
 */
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc = 0.0;
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG left;

	for (left = n; left > 0; left -= 8, xp += 8, yp += 8)
	{
		/* keep the left-to-right order of the eight products */
		acc += yp[0] * xp[0]
		     + yp[1] * xp[1]
		     + yp[2] * xp[2]
		     + yp[3] * xp[3]
		     + yp[4] * xp[4]
		     + yp[5] * xp[5]
		     + yp[6] * xp[6]
		     + yp[7] * xp[7] ;
	}

	*d += acc;
}
#endif
/*
 * Dot product driver: returns the sum over k of x[k*inc_x] * y[k*inc_y].
 *
 *   n         : number of elements; n <= 0 returns 0.0
 *   x, inc_x  : first vector and its element stride
 *   y, inc_y  : second vector and its element stride
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	FLOAT dot = 0.0 ;

	if ( n <= 0 ) return(dot);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* Largest multiple of 8 not exceeding n. Kept in BLASLONG so the
		 * count is not truncated to int for very long vectors. */
		BLASLONG n1 = n & -8;

		/* dot is still 0.0 here, so the kernel result becomes the
		 * starting value for the scalar tail below */
		if ( n1 )
			ddot_kernel_8(n1, x, y , &dot );

		i = n1;
		while(i < n)
		{
			dot += y[i] * x[i] ;
			i++ ;
		}
		return(dot);
	}

	/* general strides */
	while(i < n)
	{
		dot += y[iy] * x[ix] ;
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(dot);
}

View File

@ -0,0 +1,84 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * Inline-assembly micro kernel for the unit-stride dot product, using the
 * four-operand vfmaddpd instruction.
 *
 *   n    : element count. The loop body runs before the counter is tested
 *          and only exits when the counter hits exactly zero after
 *          "subq $8", so n must be a positive multiple of 8.
 *   x, y : addressing uses a scale of 8, i.e. the code assumes 8-byte
 *          FLOAT elements.
 *   dot  : receives the result; the final vmovsd stores the sum, it does
 *          not add to the previous contents of *dot.
 *
 * Four independent accumulators (xmm4..xmm7) are kept during the loop and
 * reduced to a single value afterwards.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
	"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
	"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
	"vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
	"vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y
	"vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y
	"vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y
	"vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y
	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
	"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
	"vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
	"vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" // add the two lanes
	"vmovsd %%xmm4, (%4) \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * Inline-assembly (SSE2/SSE3) micro kernel for the unit-stride dot product.
 *
 *   n    : element count. The loop body runs before the counter is tested
 *          and only exits when the counter hits exactly zero after
 *          "subq $8", so n must be a positive multiple of 8.
 *   x, y : addressing uses a scale of 8, i.e. the code assumes 8-byte
 *          FLOAT elements.
 *   dot  : receives the result; the final movsd stores the sum, it does
 *          not add to the previous contents of *dot.
 *
 * Four independent accumulators (xmm4..xmm7) are kept during the loop and
 * reduced to a single value afterwards.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"xorpd %%xmm4, %%xmm4 \n\t"
	"xorpd %%xmm5, %%xmm5 \n\t"
	"xorpd %%xmm6, %%xmm6 \n\t"
	"xorpd %%xmm7, %%xmm7 \n\t"
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
	"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
	"movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
	"movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
	"movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
	"movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
	"movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
	"movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
	"mulpd %%xmm8 , %%xmm12 \n\t" // x * y
	"mulpd %%xmm9 , %%xmm13 \n\t"
	"mulpd %%xmm10, %%xmm14 \n\t"
	"mulpd %%xmm11, %%xmm15 \n\t"
	"addpd %%xmm12, %%xmm4 \n\t" // accumulate
	"addpd %%xmm13, %%xmm5 \n\t"
	"addpd %%xmm14, %%xmm6 \n\t"
	"addpd %%xmm15, %%xmm7 \n\t"
	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	"addpd %%xmm5, %%xmm4 \n\t"
	"addpd %%xmm7, %%xmm6 \n\t"
	"addpd %%xmm6, %%xmm4 \n\t"
	"haddpd %%xmm4, %%xmm4 \n\t" // add the two lanes
	"movsd %%xmm4, (%4) \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (dot)	// 4
	: "cc",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL)
#include "dgemv_n_microk_haswell-2.c"
#elif defined(NEHALEM)
#include "dgemv_n_microk_nehalem-2.c"
#endif

View File

@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16x4 1
/*
 * Inline-assembly (SSE2/SSE3) micro kernel for a four-column gemv update:
 *
 *   y[k] += ap[0][k]*x[0] + ap[1][k]*x[1] + ap[2][k]*x[2] + ap[3][k]*x[3]
 *
 *   n  : number of rows. Each pass of the loop handles 8 rows; the body
 *        runs before the counter is tested and only exits when the counter
 *        hits exactly zero after "subq $8", so n must be a positive
 *        multiple of 8.
 *   ap : four column pointers, ap[0] .. ap[3]
 *   x  : four multipliers, x[0] .. x[3]
 *   y  : result vector, updated in place
 *
 * Addressing uses a scale of 8, i.e. the code assumes 8-byte FLOAT elements.
 */
static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movddup (%2), %%xmm12 \n\t" // x0
	"movddup 8(%2), %%xmm13 \n\t" // x1
	"movddup 16(%2), %%xmm14 \n\t" // x2
	"movddup 24(%2), %%xmm15 \n\t" // x3
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 192(%3,%0,8) \n\t"
	"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
	"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
	"movups 32(%3,%0,8), %%xmm6 \n\t" // 2 * y
	"movups 48(%3,%0,8), %%xmm7 \n\t" // 2 * y
	"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
	"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
	"movups 32(%4,%0,8), %%xmm10 \n\t" // 2 * a
	"movups 48(%4,%0,8), %%xmm11 \n\t" // 2 * a
	"prefetcht0 192(%4,%0,8) \n\t"
	"mulpd %%xmm12 , %%xmm8 \n\t" // a * x
	"mulpd %%xmm12 , %%xmm9 \n\t" // a * x
	"mulpd %%xmm12 , %%xmm10 \n\t" // a * x
	"mulpd %%xmm12 , %%xmm11 \n\t" // a * x
	"addpd %%xmm8 , %%xmm4 \n\t" // y += a * x
	"addpd %%xmm9 , %%xmm5 \n\t" // y += a * x
	"addpd %%xmm10 , %%xmm6 \n\t" // y += a * x
	"addpd %%xmm11 , %%xmm7 \n\t" // y += a * x
	"prefetcht0 192(%5,%0,8) \n\t"
	"movups (%5,%0,8), %%xmm8 \n\t" // 2 * a
	"movups 16(%5,%0,8), %%xmm9 \n\t" // 2 * a
	"movups 32(%5,%0,8), %%xmm10 \n\t" // 2 * a
	"movups 48(%5,%0,8), %%xmm11 \n\t" // 2 * a
	"mulpd %%xmm13 , %%xmm8 \n\t" // a * x
	"mulpd %%xmm13 , %%xmm9 \n\t" // a * x
	"mulpd %%xmm13 , %%xmm10 \n\t" // a * x
	"mulpd %%xmm13 , %%xmm11 \n\t" // a * x
	"addpd %%xmm8 , %%xmm4 \n\t" // y += a * x
	"addpd %%xmm9 , %%xmm5 \n\t" // y += a * x
	"addpd %%xmm10 , %%xmm6 \n\t" // y += a * x
	"addpd %%xmm11 , %%xmm7 \n\t" // y += a * x
	"prefetcht0 192(%6,%0,8) \n\t"
	"movups (%6,%0,8), %%xmm8 \n\t" // 2 * a
	"movups 16(%6,%0,8), %%xmm9 \n\t" // 2 * a
	"movups 32(%6,%0,8), %%xmm10 \n\t" // 2 * a
	"movups 48(%6,%0,8), %%xmm11 \n\t" // 2 * a
	"mulpd %%xmm14 , %%xmm8 \n\t" // a * x
	"mulpd %%xmm14 , %%xmm9 \n\t" // a * x
	"mulpd %%xmm14 , %%xmm10 \n\t" // a * x
	"mulpd %%xmm14 , %%xmm11 \n\t" // a * x
	"addpd %%xmm8 , %%xmm4 \n\t" // y += a * x
	"addpd %%xmm9 , %%xmm5 \n\t" // y += a * x
	"addpd %%xmm10 , %%xmm6 \n\t" // y += a * x
	"addpd %%xmm11 , %%xmm7 \n\t" // y += a * x
	"prefetcht0 192(%7,%0,8) \n\t"
	"movups (%7,%0,8), %%xmm8 \n\t" // 2 * a
	"movups 16(%7,%0,8), %%xmm9 \n\t" // 2 * a
	"movups 32(%7,%0,8), %%xmm10 \n\t" // 2 * a
	"movups 48(%7,%0,8), %%xmm11 \n\t" // 2 * a
	"mulpd %%xmm15 , %%xmm8 \n\t" // a * x
	"mulpd %%xmm15 , %%xmm9 \n\t" // a * x
	"mulpd %%xmm15 , %%xmm10 \n\t" // a * x
	"mulpd %%xmm15 , %%xmm11 \n\t" // a * x
	"addpd %%xmm8 , %%xmm4 \n\t" // y += a * x
	"addpd %%xmm9 , %%xmm5 \n\t" // y += a * x
	"addpd %%xmm10 , %%xmm6 \n\t" // y += a * x
	"addpd %%xmm11 , %%xmm7 \n\t" // y += a * x
	"movups %%xmm4, (%3,%0,8) \n\t" // 2 * y
	"movups %%xmm5, 16(%3,%0,8) \n\t" // 2 * y
	"movups %%xmm6, 32(%3,%0,8) \n\t" // 2 * y
	"movups %%xmm7, 48(%3,%0,8) \n\t" // 2 * y
	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	:
	  /* i and n are modified by addq/subq above, so they must be
	   * read-write operands rather than plain inputs */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3])	// 7
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

299
kernel/x86_64/dsymv_L.c Normal file
View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_L_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4

/*
 * Portable fallback for the four-column micro-kernel, compiled only when no
 * CPU-specific file has defined HAVE_KERNEL_4x4.
 *
 * Rows from..to-1 are consumed in groups of four.  For every row r and every
 * column c = 0..3 it performs, in this order:
 *
 *     y[r]      += tmp1[c] * ap[c][r]
 *     colsum[c] += ap[c][r] * x[r]
 *
 * and finally adds colsum[c] into temp2[c].
 *
 * Because a whole group of four rows is processed per step, (to - from) is
 * expected to be a multiple of 4; otherwise the last group touches rows at
 * or beyond `to`.
 */
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
{
	FLOAT colsum[4] = { 0.0, 0.0, 0.0, 0.0 };
	BLASLONG base, row, col;

	for (base = from; base < to; base += 4)
	{
		for (row = base; row < base + 4; row++)
		{
			for (col = 0; col < 4; col++)
			{
				y[row]      += tmp1[col] * ap[col][row];
				colsum[col] += ap[col][row] * x[row];
			}
		}
	}

	/* Fold the local partial sums into the caller's accumulators. */
	for (col = 0; col < 4; col++)
		temp2[col] += colsum[col];
}

#endif
/*
 * Symmetric matrix-vector update that reads only the diagonal and the
 * entries below it.  With A(i,j) read from a[j*lda + i], every column j in
 * [0, offset) contributes
 *
 *     y[j] += alpha * x[j] * A(j,j)
 *     y[i] += alpha * x[j] * A(i,j)              for j < i < m
 *     y[j] += alpha * sum over i in (j, m) of A(i,j) * x[i]
 *
 * m        upper bound (exclusive) of the row index
 * offset   number of leading columns to process
 * alpha    scalar applied to every contribution
 * a, lda   matrix storage and distance between consecutive columns
 * x, inc_x input vector and its stride
 * y, inc_y vector updated in place and its stride
 * buffer   not used by this implementation
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col, k;
	BLASLONG xpos, ypos;
	BLASLONG xdiag, ydiag;
	BLASLONG blocked_cols;
	BLASLONG rest;
	FLOAT scaled;
	FLOAT dot;
	FLOAT ax[4];
	FLOAT dots[4];
	FLOAT *colp[4];
	FLOAT *acol;

#if 0
	if ( m != offset )
		printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif

	/* Generic path: any non-unit stride is handled one column at a time. */
	if ( (inc_x != 1) || (inc_y != 1) )
	{
		xdiag = 0;
		ydiag = 0;

		for (col = 0; col < offset; col++)
		{
			acol   = &a[col*lda];
			scaled = alpha * x[xdiag];
			dot    = 0.0;

			y[ydiag] += scaled * acol[col];

			xpos = xdiag;
			ypos = ydiag;
			for (row = col + 1; row < m; row++)
			{
				xpos += inc_x;
				ypos += inc_y;
				y[ypos] += scaled * acol[row];
				dot     += acol[row] * x[xpos];
			}

			y[ydiag] += alpha * dot;

			xdiag += inc_x;
			ydiag += inc_y;
		}
		return(0);
	}

	/* Unit-stride path: four columns at a time, then leftover columns. */
	blocked_cols = offset - (offset % 4);

	for (col = 0; col < blocked_cols; col += 4)
	{
		for (k = 0; k < 4; k++)
		{
			ax[k]   = alpha * x[col + k];
			dots[k] = 0.0;
		}

		colp[0] = &a[col*lda];
		for (k = 1; k < 4; k++)
			colp[k] = colp[k - 1] + lda;

		/* Diagonal entries of the 4x4 block. */
		for (k = 0; k < 4; k++)
			y[col + k] += ax[k] * colp[k][col + k];

		/* Strictly-lower part of the 4x4 diagonal block (columns 0..2). */
		for (k = 0; k < 3; k++)
		{
			for (row = col + k + 1; row < col + 4; row++)
			{
				y[row]  += ax[k] * colp[k][row];
				dots[k] += colp[k][row] * x[row];
			}
		}

		/*
		 * Rows below the diagonal block.  When enough rows remain, the
		 * micro-kernel takes everything up to the last multiple of 4 and
		 * the scalar loop only mops up the remainder.
		 */
		rest = col + 4;
		if ( m - (col + 1) >= 12 )
		{
			BLASLONG m4 = (m/4)*4;

			if ( m4 > col + 4 )
				dsymv_kernel_4x4(col + 4, m4, colp, x, y, ax, dots);
			rest = m4;
		}

		for (row = rest; row < m; row++)
		{
			for (k = 0; k < 4; k++)
			{
				y[row]  += ax[k] * colp[k][row];
				dots[k] += colp[k][row] * x[row];
			}
		}

		for (k = 0; k < 4; k++)
			y[col + k] += alpha * dots[k];
	}

	/* Columns that did not fill a complete group of four. */
	for (col = blocked_cols; col < offset; col++)
	{
		acol   = &a[col*lda];
		scaled = alpha * x[col];
		dot    = 0.0;

		y[col] += scaled * acol[col];

		for (row = col + 1; row < m; row++)
		{
			y[row] += scaled * acol[row];
			dot    += acol[row] * x[row];
		}

		y[col] += alpha * dot;
	}

	return(0);
}

View File

@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * Four-column micro-kernel written with AVX-encoded instructions and the
 * four-operand vfmaddpd form.
 *
 * For rows from..to-1 (four rows per loop iteration) and columns c = 0..3:
 *     y[row]   += temp1[c] * a[c][row]
 *     temp2[c] += a[c][row] * x[row]
 * The per-column sums are kept in xmm0..xmm3 during the loop and added to
 * the values already stored in temp2[0..3] afterwards.
 *
 * Preconditions (not checked here): to > from and (to - from) is a multiple
 * of 4.  The loop body executes before the first comparison and the row
 * index advances by 4 until it is exactly equal to `to`.
 *
 * The row index register (operand 0) is advanced inside the asm, so `from`
 * is declared as a read-write operand; declaring it input-only would let the
 * compiler assume the register still holds the original value.
 */
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
	__asm__ __volatile__
	(
	"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
	"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
	"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
	"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
	"vmovddup (%8), %%xmm4 \n\t" // temp1[0]
	"vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
	"vmovddup 16(%8), %%xmm6 \n\t" // temp1[2]
	"vmovddup 24(%8), %%xmm7 \n\t" // temp1[3]
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
	"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
	"vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
	"vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
	"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
	"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
	"vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
	"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
	"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
	"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
	"vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
	"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
	"vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
	"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
	"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
	"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
	"addq $4 , %0 \n\t" // advance row index by 4
	"vmovups %%xmm9 , -32(%3,%0,8) \n\t" // store y for rows 0..1 of this step
	"vmovups %%xmm11 , -16(%3,%0,8) \n\t" // store y for rows 2..3 of this step
	"cmpq %0 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	"vmovsd (%9), %%xmm4 \n\t" // previous temp2[0]
	"vmovsd 8(%9), %%xmm5 \n\t" // previous temp2[1]
	"vmovsd 16(%9), %%xmm6 \n\t" // previous temp2[2]
	"vmovsd 24(%9), %%xmm7 \n\t" // previous temp2[3]
	"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" // add the two lanes of each sum
	"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
	"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
	"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
	"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
	"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
	"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
	"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
	"vmovsd %%xmm0 , (%9) \n\t" // save temp2
	"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
	"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
	"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
	:
	  "+r" (from) // 0 (row index, advanced by the loop)
	:
	  "r" (to), // 1
	  "r" (x), // 2
	  "r" (y), // 3
	  "r" (a[0]), // 4
	  "r" (a[1]), // 5
	  "r" (a[2]), // 6
	  "r" (a[3]), // 7
	  "r" (temp1), // 8
	  "r" (temp2) // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -0,0 +1,132 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * Four-column micro-kernel written with SSE2/SSE3 instructions.
 *
 * For rows from..to-1 (two rows per loop iteration) and columns c = 0..3:
 *     y[row]   += temp1[c] * a[c][row]
 *     temp2[c] += a[c][row] * x[row]
 * The per-column sums are kept in xmm0..xmm3 during the loop and added to
 * the values already stored in temp2[0..3] afterwards.
 *
 * Preconditions (not checked here): to > from and (to - from) is a multiple
 * of 2.  The loop body executes before the first comparison and the row
 * index advances by 2 until it is exactly equal to `to`.
 *
 * The row index register (operand 0) is advanced inside the asm, so `from`
 * is declared as a read-write operand; declaring it input-only would let the
 * compiler assume the register still holds the original value.
 */
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
	__asm__ __volatile__
	(
	"xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
	"xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
	"xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
	"xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
	"movsd (%8), %%xmm4 \n\t" // temp1[0]
	"movsd 8(%8), %%xmm5 \n\t" // temp1[1]
	"movsd 16(%8), %%xmm6 \n\t" // temp1[2]
	"movsd 24(%8), %%xmm7 \n\t" // temp1[3]
	"shufpd $0, %%xmm4, %%xmm4 \n\t" // duplicate temp1[c] into both lanes
	"shufpd $0, %%xmm5, %%xmm5 \n\t"
	"shufpd $0, %%xmm6, %%xmm6 \n\t"
	"shufpd $0, %%xmm7, %%xmm7 \n\t"
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
	"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
	"movups %%xmm12 , %%xmm11 \n\t"
	"movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
	"movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
	"mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm12 \n\t" // a * x
	"addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
	"movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
	"movups %%xmm13 , %%xmm11 \n\t"
	"mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm13 \n\t" // a * x
	"addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"movups %%xmm14 , %%xmm11 \n\t"
	"mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm14 \n\t" // a * x
	"addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"addq $2 , %0 \n\t" // advance row index by 2
	"movups %%xmm15 , %%xmm11 \n\t"
	"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm15 \n\t" // a * x
	"addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
	"cmpq %0 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"
	"movsd (%9), %%xmm4 \n\t" // previous temp2[0]
	"movsd 8(%9), %%xmm5 \n\t" // previous temp2[1]
	"movsd 16(%9), %%xmm6 \n\t" // previous temp2[2]
	"movsd 24(%9), %%xmm7 \n\t" // previous temp2[3]
	"haddpd %%xmm0, %%xmm0 \n\t" // add the two lanes of each sum
	"haddpd %%xmm1, %%xmm1 \n\t"
	"haddpd %%xmm2, %%xmm2 \n\t"
	"haddpd %%xmm3, %%xmm3 \n\t"
	"addsd %%xmm4, %%xmm0 \n\t"
	"addsd %%xmm5, %%xmm1 \n\t"
	"addsd %%xmm6, %%xmm2 \n\t"
	"addsd %%xmm7, %%xmm3 \n\t"
	"movsd %%xmm0 , (%9) \n\t" // save temp2
	"movsd %%xmm1 , 8(%9) \n\t" // save temp2
	"movsd %%xmm2 , 16(%9) \n\t" // save temp2
	"movsd %%xmm3 , 24(%9) \n\t" // save temp2
	:
	  "+r" (from) // 0 (row index, advanced by the loop)
	:
	  "r" (to), // 1
	  "r" (x), // 2
	  "r" (y), // 3
	  "r" (a[0]), // 4
	  "r" (a[1]), // 5
	  "r" (a[2]), // 6
	  "r" (a[3]), // 7
	  "r" (temp1), // 8
	  "r" (temp2) // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

273
kernel/x86_64/dsymv_U.c Normal file
View File

@ -0,0 +1,273 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "dsymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_U_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4

/*
 * Portable fallback for the four-column micro-kernel, compiled only when no
 * CPU-specific file has defined HAVE_KERNEL_4x4.
 *
 * For each row r in [0, n):
 *     yp[r] += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     acc_c += a_c[r] * xp[r]                      for c = 0..3
 * and on exit acc_c is added into temp2[c].
 */
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	/* Scale factors are read once, before any element of yp is written. */
	const FLOAT s0 = temp1[0];
	const FLOAT s1 = temp1[1];
	const FLOAT s2 = temp1[2];
	const FLOAT s3 = temp1[3];
	FLOAT acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0;
	BLASLONG row;

	for (row = 0; row < n; row++)
	{
		const FLOAT c0 = a0[row];
		const FLOAT c1 = a1[row];
		const FLOAT c2 = a2[row];
		const FLOAT c3 = a3[row];
		const FLOAT xv = xp[row];

		yp[row] += s0 * c0 + s1 * c1 + s2 * c2 + s3 * c3;

		acc0 += c0 * xv;
		acc1 += c1 * xv;
		acc2 += c2 * xv;
		acc3 += c3 * xv;
	}

	temp2[0] += acc0;
	temp2[1] += acc1;
	temp2[2] += acc2;
	temp2[3] += acc3;
}

#endif
#ifndef HAVE_KERNEL_1x4

/*
 * Row-at-a-time companion of the four-column micro-kernel, working on the
 * half-open row range [from, to).
 *
 * For each row r in that range:
 *     yp[r] += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     acc_c += a_c[r] * xp[r]                      for c = 0..3
 * and on exit acc_c is added into temp2[c].
 */
static void dsymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	/* Scale factors are read once, before any element of yp is written. */
	const FLOAT s0 = temp1[0];
	const FLOAT s1 = temp1[1];
	const FLOAT s2 = temp1[2];
	const FLOAT s3 = temp1[3];
	FLOAT acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0;
	BLASLONG row = from;

	while (row < to)
	{
		const FLOAT c0 = a0[row];
		const FLOAT c1 = a1[row];
		const FLOAT c2 = a2[row];
		const FLOAT c3 = a3[row];
		const FLOAT xv = xp[row];

		yp[row] += s0 * c0 + s1 * c1 + s2 * c2 + s3 * c3;

		acc0 += c0 * xv;
		acc1 += c1 * xv;
		acc2 += c2 * xv;
		acc3 += c3 * xv;

		row++;
	}

	temp2[0] += acc0;
	temp2[1] += acc1;
	temp2[2] += acc2;
	temp2[3] += acc3;
}

#endif
/*
 * Single-column kernel.  Rows 0 .. (n/4)*4 - 1 are handled four per step;
 * any remaining n % 4 rows are left untouched.
 *
 * For each handled row r:
 *     yp[r] += (*temp1) * a0[r]
 *     dot   += a0[r] * xp[r]
 *
 * On exit *temp2 is overwritten with the accumulated dot value (it is
 * assigned, not added to).
 */
static void dsymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
	const FLOAT scale = *temp1;
	FLOAT dot = 0.0;
	FLOAT c[4];
	BLASLONG base, k;

	for (base = 0; base < (n/4)*4; base += 4)
	{
		/* Fetch the four matrix entries before touching yp. */
		for (k = 0; k < 4; k++)
			c[k] = a0[base + k];

		for (k = 0; k < 4; k++)
		{
			yp[base + k] += scale * c[k];
			dot          += c[k] * xp[base + k];
		}
	}

	*temp2 = dot;
}
/*
 * Symmetric matrix-vector update that reads only the diagonal and the
 * entries above it.  With A(i,j) read from a[j*lda + i], every column j in
 * [m - offset, m) contributes
 *
 *     y[i] += alpha * x[j] * A(i,j)              for 0 <= i < j
 *     y[j] += alpha * x[j] * A(j,j)
 *           + alpha * sum over i in [0, j) of A(i,j) * x[i]
 *
 * m        one past the last column index processed
 * offset   number of trailing columns to process
 * alpha    scalar applied to every contribution
 * a, lda   matrix storage and distance between consecutive columns
 * x, inc_x input vector and its stride
 * y, inc_y vector updated in place and its stride
 * buffer   not used by this implementation
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col, k;
	BLASLONG xpos, ypos;
	BLASLONG xdiag, ydiag;
	BLASLONG aligned;
	BLASLONG blocked_end;
	FLOAT scaled;
	FLOAT dot;
	FLOAT *acol;
	FLOAT *cp[4];
	FLOAT ax[4];
	FLOAT dots[4];

#if 0
	if( m != offset )
		printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif

	BLASLONG first = m - offset;
	BLASLONG ncols = m - first;

	/* Generic path: non-unit strides, or too few columns to block. */
	if ( (inc_x!=1) || (inc_y!=1) || (ncols<16) )
	{
		xdiag = first * inc_x;
		ydiag = first * inc_y;

		for (col = first; col < m; col++)
		{
			acol   = &a[col*lda];
			scaled = alpha * x[xdiag];
			dot    = 0.0;

			xpos = 0;
			ypos = 0;
			for (row = 0; row < col; row++)
			{
				y[ypos] += scaled * acol[row];
				dot     += acol[row] * x[xpos];
				xpos += inc_x;
				ypos += inc_y;
			}

			y[ydiag] += scaled * acol[col] + alpha * dot;

			xdiag += inc_x;
			ydiag += inc_y;
		}
		return(0);
	}

	/* Unit-stride path: four columns at a time, then leftover columns. */
	blocked_end = m - ( ncols % 4 );

	for (col = first; col < blocked_end; col += 4)
	{
		for (k = 0; k < 4; k++)
		{
			ax[k]   = alpha * x[col + k];
			dots[k] = 0.0;
		}

		cp[0] = &a[col*lda];
		for (k = 1; k < 4; k++)
			cp[k] = cp[k - 1] + lda;

		/*
		 * Rows above the 4x4 diagonal block: the first (col/8)*8 rows go
		 * to the micro-kernel, the rest up to `col` to the scalar kernel.
		 */
		aligned = (col/8)*8;
		if ( aligned )
			dsymv_kernel_4x4(aligned, cp[0], cp[1], cp[2], cp[3], x, y, ax, dots);
		if ( aligned < col )
			dsymv_kernel_1x4(aligned, col, cp[0], cp[1], cp[2], cp[3], x, y, ax, dots);

		/* The 4x4 diagonal block itself, one column after another. */
		for (k = 0; k < 4; k++)
		{
			acol   = cp[k];
			scaled = ax[k];
			dot    = dots[k];

			for (row = col; row < col + k; row++)
			{
				y[row] += scaled * acol[row];
				dot    += acol[row] * x[row];
			}

			y[col + k] += scaled * acol[col + k] + alpha * dot;
		}
	}

	/* Columns that did not fill a complete group of four. */
	for ( ; col < m; col++)
	{
		acol   = &a[col*lda];
		scaled = alpha * x[col];
		dot    = 0.0;

		aligned = (col/8)*8;
		if ( aligned )
			dsymv_kernel_8x1(aligned, acol, x, y, &scaled, &dot);

		for (row = aligned; row < col; row++)
		{
			FLOAT av = acol[row];

			y[row] += scaled * av;
			dot    += av * x[row];
		}

		y[col] += scaled * acol[col] + alpha * dot;
	}

	return(0);
}

View File

@ -0,0 +1,130 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * Four-column micro-kernel written with AVX-encoded instructions and the
 * four-operand vfmaddpd form.
 *
 * For rows 0..n-1 (four rows per loop iteration) and columns c = 0..3:
 *     y[row] += temp1[c] * a_c[row]
 *     sum_c  += a_c[row] * x[row]
 * On exit temp2[c] is overwritten with sum_c; the previous contents of
 * temp2 are not read.
 *
 * Preconditions (not checked here): n > 0 and n is a multiple of 4.  The
 * loop body executes before the first test and the remaining-row counter is
 * decremented by 4 until it reaches exactly zero.
 *
 * Both the row index (operand 0) and the remaining-row counter (operand 1)
 * are changed inside the asm, so `i` and `n` are declared as read-write
 * operands; declaring them input-only would let the compiler assume the
 * registers still hold their original values.
 */
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
	"vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
	"vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
	"vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
	"vmovddup (%8), %%xmm4 \n\t" // temp1[0]
	"vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
	"vmovddup 16(%8), %%xmm6 \n\t" // temp1[2]
	"vmovddup 24(%8), %%xmm7 \n\t" // temp1[3]
	"xorq %0,%0 \n\t" // row index = 0
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
	"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
	"vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
	"vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
	"vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
	"vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
	"vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
	"vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
	"vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
	"vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
	"vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
	"vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
	"vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
	"vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
	"vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
	"vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"addq $4 , %0 \n\t" // advance row index by 4
	"vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
	"vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
	"subq $4 , %1 \n\t" // four fewer rows remaining; sets the flags used by jnz
	"vmovups %%xmm9 , -32(%3,%0,8) \n\t" // store y for rows 0..1 of this step
	"vmovups %%xmm11 , -16(%3,%0,8) \n\t" // store y for rows 2..3 of this step
	"jnz .L01LOOP%= \n\t"
	"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" // add the two lanes of each sum
	"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
	"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
	"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
	"vmovsd %%xmm0 , (%9) \n\t" // save temp2
	"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
	"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
	"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
	:
	  "+r" (i), // 0 (row index, advanced by the loop)
	  "+r" (n) // 1 (remaining rows, counted down by the loop)
	:
	  "r" (x), // 2
	  "r" (y), // 3
	  "r" (a0), // 4
	  "r" (a1), // 5
	  "r" (a2), // 6
	  "r" (a3), // 7
	  "r" (temp1), // 8
	  "r" (temp2) // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -0,0 +1,125 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * Four-column micro-kernel written with SSE2/SSE3 instructions.
 *
 * For rows 0..n-1 (two rows per loop iteration) and columns c = 0..3:
 *     y[row] += temp1[c] * a_c[row]
 *     sum_c  += a_c[row] * x[row]
 * On exit temp2[c] is overwritten with sum_c; the previous contents of
 * temp2 are not read.
 *
 * Preconditions (not checked here): n > 0 and n is a multiple of 2.  The
 * loop body executes before the first test and the remaining-row counter is
 * decremented by 2 until it reaches exactly zero.
 *
 * Both the row index (operand 0) and the remaining-row counter (operand 1)
 * are changed inside the asm, so `i` and `n` are declared as read-write
 * operands; declaring them input-only would let the compiler assume the
 * registers still hold their original values.
 */
static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
	"xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
	"xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
	"xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
	"movsd (%8), %%xmm4 \n\t" // temp1[0]
	"movsd 8(%8), %%xmm5 \n\t" // temp1[1]
	"movsd 16(%8), %%xmm6 \n\t" // temp1[2]
	"movsd 24(%8), %%xmm7 \n\t" // temp1[3]
	"shufpd $0, %%xmm4, %%xmm4 \n\t" // duplicate temp1[c] into both lanes
	"shufpd $0, %%xmm5, %%xmm5 \n\t"
	"shufpd $0, %%xmm6, %%xmm6 \n\t"
	"shufpd $0, %%xmm7, %%xmm7 \n\t"
	"xorq %0,%0 \n\t" // row index = 0
	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
	"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
	"movups %%xmm12 , %%xmm11 \n\t"
	"movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
	"movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
	"mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm12 \n\t" // a * x
	"addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
	"movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
	"movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
	"movups %%xmm13 , %%xmm11 \n\t"
	"mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm13 \n\t" // a * x
	"addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
	"movups %%xmm14 , %%xmm11 \n\t"
	"mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm14 \n\t" // a * x
	"addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
	"addq $2 , %0 \n\t" // advance row index by 2
	"movups %%xmm15 , %%xmm11 \n\t"
	"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
	"addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
	"mulpd %%xmm8 , %%xmm15 \n\t" // a * x
	"addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
	"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
	"subq $2 , %1 \n\t" // two fewer rows remaining; sets the flags used by jnz
	"jnz .L01LOOP%= \n\t"
	"haddpd %%xmm0, %%xmm0 \n\t" // add the two lanes of each sum
	"haddpd %%xmm1, %%xmm1 \n\t"
	"haddpd %%xmm2, %%xmm2 \n\t"
	"haddpd %%xmm3, %%xmm3 \n\t"
	"movsd %%xmm0 , (%9) \n\t" // save temp2
	"movsd %%xmm1 , 8(%9) \n\t" // save temp2
	"movsd %%xmm2 , 16(%9) \n\t" // save temp2
	"movsd %%xmm3 , 24(%9) \n\t" // save temp2
	:
	  "+r" (i), // 0 (row index, advanced by the loop)
	  "+r" (n) // 1 (remaining rows, counted down by the loop)
	:
	  "r" (x), // 2
	  "r" (y), // 3
	  "r" (a0), // 4
	  "r" (a1), // 5
	  "r" (a2), // 6
	  "r" (a3), // 7
	  "r" (temp1), // 8
	  "r" (temp2) // 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

103
kernel/x86_64/saxpy.c Normal file
View File

@ -0,0 +1,103 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_16

/*
 * Portable fallback for the unit-stride AXPY kernel, compiled only when no
 * CPU-specific file has defined HAVE_KERNEL_16.
 *
 * Performs y[k] += (*alpha) * x[k] for k starting at 0, eight elements per
 * step, while the step's first index is below n.  A whole group of eight is
 * always processed, so n is expected to be a multiple of 8; otherwise the
 * last group touches elements at or beyond n.
 */
static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT scale = *alpha;
	BLASLONG base, k;

	for (base = 0; base < n; base += 8)
	{
		for (k = 0; k < 8; k++)
			y[base + k] += scale * x[base + k];
	}
}

#endif
/*
 * AXPY: y[k*inc_y] += da * x[k*inc_x] for k = 0..n-1.
 *
 * n            number of elements; nothing is done when n <= 0
 * da           scalar multiplier
 * x, inc_x     input vector and its stride
 * y, inc_y     vector updated in place and its stride
 * dummy0, dummy1, dummy, dummy2
 *              not used by this implementation
 *
 * When both strides are 1, the largest multiple of 16 not exceeding n is
 * handed to saxpy_kernel_16 and the remaining elements are processed one by
 * one.  Returns 0 in all cases.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/*
		 * Keep the block count in BLASLONG: storing it in an int would
		 * truncate large n, and a truncated (possibly negative) value
		 * would then be used as the starting index of the tail loop.
		 */
		BLASLONG n1 = n & -16;

		if ( n1 )
			saxpy_kernel_16(n1, x, y , &da );

		/* Leftover elements that do not fill a group of 16. */
		i = n1;
		while(i < n)
		{
			y[i] += da * x[i] ;
			i++ ;
		}
		return(0);
	}

	/* General strides. */
	while(i < n)
	{
		y[iy] += da * x[ix] ;
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(0);
}

View File

@ -0,0 +1,91 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1

static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

/*
 * SSE micro-kernel for the unit-stride update y[k] += (*alpha) * x[k],
 * k = 0 .. n-1, handling 16 floats (four XMM registers) per iteration.
 *
 * n      element count; must be a positive multiple of 16, because the loop
 *        body runs before the counter is tested and exits only when
 *        "subq $16" brings the counter to exactly zero
 * x      source vector (read with unaligned loads)
 * y      destination vector (read and written with unaligned moves)
 * alpha  pointer to the scalar multiplier
 *
 * The asm advances the index register (%0) and decrements the counter (%1),
 * so both are declared as read-write ("+r") operands.  Declaring a register
 * that the asm modifies as a plain input lets the compiler assume it still
 * holds its original value afterwards.
 */
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG register i = 0;

    __asm__  __volatile__
    (
    "movss          (%4), %%xmm0         \n\t"  // alpha
    "shufps $0,  %%xmm0, %%xmm0          \n\t"  // broadcast alpha to all 4 lanes

    ".align 16                           \n\t"
    ".L01LOOP%=:                         \n\t"
    // "prefetcht0  192(%2,%0,4)         \n\t"
    // "prefetcht0  192(%3,%0,4)         \n\t"

    "movups       (%2,%0,4), %%xmm12     \n\t"  // 4 * x
    "movups     16(%2,%0,4), %%xmm13     \n\t"  // 4 * x
    "movups     32(%2,%0,4), %%xmm14     \n\t"  // 4 * x
    "movups     48(%2,%0,4), %%xmm15     \n\t"  // 4 * x
    "movups       (%3,%0,4), %%xmm8      \n\t"  // 4 * y
    "movups     16(%3,%0,4), %%xmm9      \n\t"  // 4 * y
    "movups     32(%3,%0,4), %%xmm10     \n\t"  // 4 * y
    "movups     48(%3,%0,4), %%xmm11     \n\t"  // 4 * y

    "mulps      %%xmm0 , %%xmm12         \n\t"  // alpha * x
    "mulps      %%xmm0 , %%xmm13         \n\t"
    "mulps      %%xmm0 , %%xmm14         \n\t"
    "mulps      %%xmm0 , %%xmm15         \n\t"

    "addps      %%xmm12, %%xmm8          \n\t"  // y += alpha * x
    "addps      %%xmm13, %%xmm9          \n\t"
    "addps      %%xmm14, %%xmm10         \n\t"
    "addps      %%xmm15, %%xmm11         \n\t"

    "movups     %%xmm8 ,   (%3,%0,4)     \n\t"
    "movups     %%xmm9 , 16(%3,%0,4)     \n\t"
    "movups     %%xmm10, 32(%3,%0,4)     \n\t"
    "movups     %%xmm11, 48(%3,%0,4)     \n\t"

    "addq       $16, %0                  \n\t"  // next 16 elements
    "subq       $16, %1                  \n\t"  // 16 fewer remaining
    "jnz        .L01LOOP%=               \n\t"

    :
      "+r" (i),     // 0  (modified: element index)
      "+r" (n)      // 1  (modified: remaining count)
    :
      "r" (x),      // 2
      "r" (y),      // 3
      "r" (alpha)   // 4
    : "cc",
      "%xmm0",
      "%xmm8", "%xmm9", "%xmm10", "%xmm11",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

109
kernel/x86_64/sdot.c Normal file
View File

@ -0,0 +1,109 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sdot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_16

/*
 * Portable fallback used when no architecture-specific micro-kernel has
 * defined HAVE_KERNEL_16.
 *
 * Adds the dot product of x[0..n-1] and y[0..n-1] to *d.  Elements are
 * consumed eight per pass, so n is expected to be a multiple of 8; a count
 * that is not would make the last pass read beyond n.
 */
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
    FLOAT acc = 0.0;
    BLASLONG pos;

    for (pos = 0; pos < n; pos += 8)
    {
        FLOAT *xs = x + pos;
        FLOAT *ys = y + pos;

        /* The eight products are summed left to right, then added to acc. */
        acc += ys[0] * xs[0]
             + ys[1] * xs[1]
             + ys[2] * xs[2]
             + ys[3] * xs[3]
             + ys[4] * xs[4]
             + ys[5] * xs[5]
             + ys[6] * xs[6]
             + ys[7] * xs[7] ;
    }

    *d += acc;
}

#endif
/*
 * SDOT driver: returns the sum over k = 0 .. n-1 of
 * y[k*inc_y] * x[k*inc_x].
 *
 * n          number of elements; 0.0 is returned when n <= 0
 * x, inc_x   first vector and its element stride
 * y, inc_y   second vector and its element stride
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    FLOAT  dot = 0.0 ;

    if ( n <= 0 )  return(dot);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        /*
         * Largest multiple of 16 that does not exceed n.  This must be a
         * BLASLONG: storing it in an int truncates the count for very long
         * vectors, which corrupts both the kernel length and the index at
         * which the scalar tail loop below resumes.
         */
        BLASLONG n1 = n & -16;

        /* dot is still 0.0 here, so it does not matter whether the kernel
           stores its result into dot or adds to it. */
        if ( n1 )
            sdot_kernel_16(n1, x, y , &dot );

        /* Scalar tail: the remaining n - n1 (< 16) elements. */
        i = n1;
        while(i < n)
        {
            dot += y[i] * x[i] ;
            i++ ;
        }
        return(dot);
    }

    /* General strides: plain scalar loop. */
    while(i < n)
    {
        dot += y[iy] * x[ix] ;
        ix  += inc_x ;
        iy  += inc_y ;
        i++ ;
    }
    return(dot);
}

View File

@ -0,0 +1,85 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1

static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * Micro-kernel (VEX-encoded, four-operand vfmaddps) computing the dot
 * product of x[0..n-1] and y[0..n-1], 16 floats per iteration, in four
 * independent 4-lane accumulators that are reduced at the end.
 *
 * n    element count; must be a positive multiple of 16, because the loop
 *      body runs before the counter is tested and exits only when
 *      "subq $16" brings the counter to exactly zero
 * x,y  input vectors (unaligned accesses)
 * dot  receives the result; the previous value of *dot is overwritten,
 *      not accumulated
 *
 * The asm advances the index register (%0) and decrements the counter (%1),
 * so both are declared as read-write ("+r") operands rather than inputs.
 */
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
    BLASLONG register i = 0;

    __asm__  __volatile__
    (
    "vxorps     %%xmm4, %%xmm4, %%xmm4               \n\t"  // accumulator 0
    "vxorps     %%xmm5, %%xmm5, %%xmm5               \n\t"  // accumulator 1
    "vxorps     %%xmm6, %%xmm6, %%xmm6               \n\t"  // accumulator 2
    "vxorps     %%xmm7, %%xmm7, %%xmm7               \n\t"  // accumulator 3

    ".align 16                                       \n\t"
    ".L01LOOP%=:                                     \n\t"
    "vmovups      (%2,%0,4), %%xmm12                 \n\t"  // 4 * x
    "vmovups    16(%2,%0,4), %%xmm13                 \n\t"  // 4 * x
    "vmovups    32(%2,%0,4), %%xmm14                 \n\t"  // 4 * x
    "vmovups    48(%2,%0,4), %%xmm15                 \n\t"  // 4 * x

    "vfmaddps   %%xmm4,   (%3,%0,4), %%xmm12, %%xmm4 \n\t"  // acc0 += x * (4 * y)
    "vfmaddps   %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t"  // acc1 += x * (4 * y)
    "vfmaddps   %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t"  // acc2 += x * (4 * y)
    "vfmaddps   %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t"  // acc3 += x * (4 * y)

    "addq       $16, %0                              \n\t"  // next 16 elements
    "subq       $16, %1                              \n\t"  // 16 fewer remaining
    "jnz        .L01LOOP%=                           \n\t"

    "vaddps     %%xmm4, %%xmm5, %%xmm4               \n\t"  // combine accumulators
    "vaddps     %%xmm6, %%xmm7, %%xmm6               \n\t"
    "vaddps     %%xmm4, %%xmm6, %%xmm4               \n\t"

    "vhaddps    %%xmm4, %%xmm4, %%xmm4               \n\t"  // horizontal sum of 4 lanes
    "vhaddps    %%xmm4, %%xmm4, %%xmm4               \n\t"

    "vmovss     %%xmm4, (%4)                         \n\t"  // *dot = result

    :
      "+r" (i),   // 0  (modified: element index)
      "+r" (n)    // 1  (modified: remaining count)
    :
      "r" (x),    // 2
      "r" (y),    // 3
      "r" (dot)   // 4
    : "cc",
      "%xmm4", "%xmm5",
      "%xmm6", "%xmm7",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

View File

@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1

static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * SSE micro-kernel computing the dot product of x[0..n-1] and y[0..n-1],
 * 16 floats per iteration, in four independent 4-lane accumulators that are
 * reduced at the end.
 *
 * n    element count; must be a positive multiple of 16, because the loop
 *      body runs before the counter is tested and exits only when
 *      "subq $16" brings the counter to exactly zero
 * x,y  input vectors (unaligned loads)
 * dot  receives the result; the previous value of *dot is overwritten,
 *      not accumulated
 *
 * The asm advances the index register (%0) and decrements the counter (%1),
 * so both are declared as read-write ("+r") operands rather than inputs.
 */
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
    BLASLONG register i = 0;

    __asm__  __volatile__
    (
    "xorps      %%xmm4, %%xmm4           \n\t"  // accumulator 0
    "xorps      %%xmm5, %%xmm5           \n\t"  // accumulator 1
    "xorps      %%xmm6, %%xmm6           \n\t"  // accumulator 2
    "xorps      %%xmm7, %%xmm7           \n\t"  // accumulator 3

    ".align 16                           \n\t"
    ".L01LOOP%=:                         \n\t"
    "movups       (%2,%0,4), %%xmm12     \n\t"  // 4 * x
    "movups       (%3,%0,4), %%xmm8      \n\t"  // 4 * y
    "movups     16(%2,%0,4), %%xmm13     \n\t"  // 4 * x
    "movups     16(%3,%0,4), %%xmm9      \n\t"  // 4 * y
    "movups     32(%2,%0,4), %%xmm14     \n\t"  // 4 * x
    "movups     32(%3,%0,4), %%xmm10     \n\t"  // 4 * y
    "movups     48(%2,%0,4), %%xmm15     \n\t"  // 4 * x
    "movups     48(%3,%0,4), %%xmm11     \n\t"  // 4 * y

    "mulps      %%xmm8 , %%xmm12         \n\t"  // x * y
    "mulps      %%xmm9 , %%xmm13         \n\t"
    "mulps      %%xmm10, %%xmm14         \n\t"
    "mulps      %%xmm11, %%xmm15         \n\t"

    "addps      %%xmm12, %%xmm4          \n\t"  // accumulate products
    "addps      %%xmm13, %%xmm5          \n\t"
    "addps      %%xmm14, %%xmm6          \n\t"
    "addps      %%xmm15, %%xmm7          \n\t"

    "addq       $16, %0                  \n\t"  // next 16 elements
    "subq       $16, %1                  \n\t"  // 16 fewer remaining
    "jnz        .L01LOOP%=               \n\t"

    "addps      %%xmm5, %%xmm4           \n\t"  // combine accumulators
    "addps      %%xmm7, %%xmm6           \n\t"
    "addps      %%xmm6, %%xmm4           \n\t"

    "haddps     %%xmm4, %%xmm4           \n\t"  // horizontal sum of 4 lanes
    "haddps     %%xmm4, %%xmm4           \n\t"

    "movss      %%xmm4, (%4)             \n\t"  // *dot = result

    :
      "+r" (i),   // 0  (modified: element index)
      "+r" (n)    // 1  (modified: remaining count)
    :
      "r" (x),    // 2
      "r" (y),    // 3
      "r" (dot)   // 4
    : "cc",
      "%xmm4", "%xmm5", "%xmm6", "%xmm7",
      "%xmm8", "%xmm9", "%xmm10", "%xmm11",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

View File

@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
VFMADD231PS_( %ymm15,%ymm3,%ymm1 )
addq $6*SIZE, BO
addq $16*SIZE, AO
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
.endm
@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
addq $6*SIZE, BO
addq $8*SIZE, AO
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
addq $6*SIZE, BO
addq $4*SIZE, AO
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
addq $6*SIZE, BO
addq $2*SIZE, AO
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
addq $6*SIZE, BO
addq $1*SIZE, AO
addq $ 6*SIZE, BO
addq $ 1*SIZE, AO
decq %rax
.endm

299
kernel/x86_64/ssymv_L.c Normal file
View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "ssymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_L_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4

/*
 * Portable fallback used when no architecture-specific micro-kernel has
 * defined HAVE_KERNEL_4x4.
 *
 * For every row r in [from, to) and each of the four columns ap[0..3]:
 *     y[r]     += tmp1[c] * ap[c][r]
 *     temp2[c] += ap[c][r] * x[r]     (summed locally, added at the end)
 *
 * Rows are consumed in groups of four, so (to - from) is expected to be a
 * multiple of 4; otherwise the last group runs past `to`.
 */
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
{
    FLOAT partial[4] = { 0.0, 0.0, 0.0, 0.0 };
    BLASLONG base, row, col;

    for (base = from; base < to; base += 4)
    {
        /* Row-major walk over the 4x4 tile: all four columns of one row
           are handled before moving to the next row. */
        for (row = base; row < base + 4; row++)
        {
            for (col = 0; col < 4; col++)
            {
                y[row]       += tmp1[col] * ap[col][row];
                partial[col] += ap[col][row] * x[row];
            }
        }
    }

    for (col = 0; col < 4; col++)
        temp2[col] += partial[col];
}

#endif
/*
 * SSYMV driver working on the part of each column at and below the diagonal
 * (the disabled debug print labels it "Symv_L").
 *
 * For every column c in [0, offset), with column data at a + c*lda:
 *     y[c] += alpha * x[c] * a[c*lda + c]
 *     y[r] += alpha * x[c] * a[c*lda + r]              for c < r < m
 *     y[c] += alpha * sum over c < r < m of a[c*lda + r] * x[r]
 *
 * m, offset      row count and number of leading columns to process
 * alpha          scalar applied to every contribution
 * a, lda         matrix storage and the distance between columns
 * x, inc_x       input vector and its element stride
 * y, inc_y       vector updated in place and its element stride
 * buffer         not referenced by this routine
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG row, col;
    BLASLONG k;

#if 0
    if ( m != offset )
        printf("Symv_L: m=%d offset=%d\n",m,offset);
#endif

    /* Non-unit strides: straightforward scalar implementation. */
    if ( !((inc_x == 1) && (inc_y == 1)) )
    {
        BLASLONG col_x = 0;
        BLASLONG col_y = 0;

        for (col = 0; col < offset; col++)
        {
            FLOAT *column  = &a[col*lda];
            FLOAT scaled_x = alpha * x[col_x];
            FLOAT dotsum   = 0.0;
            BLASLONG row_x = col_x;
            BLASLONG row_y = col_y;

            y[col_y] += scaled_x * column[col];

            for (row = col + 1; row < m; row++)
            {
                row_x += inc_x;
                row_y += inc_y;
                y[row_y] += scaled_x * column[row];
                dotsum   += column[row] * x[row_x];
            }

            y[col_y] += alpha * dotsum;
            col_x += inc_x;
            col_y += inc_y;
        }
        return(0);
    }

    /* Unit strides: handle four columns at a time while possible. */
    BLASLONG blocked_cols = (offset/4)*4;

    for (col = 0; col < blocked_cols; col += 4)
    {
        FLOAT  tmp1[4];     /* alpha * x[col + k]             */
        FLOAT  tmp2[4];     /* per-column dot-product sums    */
        FLOAT *ap[4];       /* start of each of the 4 columns */
        BLASLONG rest;

        for (k = 0; k < 4; k++)
        {
            tmp1[k] = alpha * x[col + k];
            tmp2[k] = 0.0;
        }

        ap[0] = &a[col*lda];
        for (k = 1; k < 4; k++)
            ap[k] = ap[k-1] + lda;

        /* Diagonal entries of the 4x4 block. */
        for (k = 0; k < 4; k++)
            y[col + k] += tmp1[k] * ap[k][col + k];

        /* Strictly-lower triangle of the 4x4 block: column k covers rows
           col+k+1 .. col+3 (nothing for k == 3). */
        for (k = 0; k < 3; k++)
        {
            for (row = col + k + 1; row < col + 4; row++)
            {
                y[row]  += tmp1[k] * ap[k][row];
                tmp2[k] += ap[k][row] * x[row];
            }
        }

        /* Rows below the block.  When enough rows remain, the micro-kernel
           takes rows up to the last multiple of four and the scalar loop
           finishes from there; otherwise the scalar loop does all of them. */
        rest = col + 4;
        if ( m - (col + 1) >= 12 )
        {
            BLASLONG m2 = (m/4)*4;

            if ( m2 > col + 4 )
                ssymv_kernel_4x4(col + 4, m2, ap, x, y, tmp1, tmp2);
            rest = m2;
        }

        for (row = rest; row < m; row++)
        {
            for (k = 0; k < 4; k++)
            {
                y[row]  += tmp1[k] * ap[k][row];
                tmp2[k] += ap[k][row] * x[row];
            }
        }

        for (k = 0; k < 4; k++)
            y[col + k] += alpha * tmp2[k];
    }

    /* Left-over columns (fewer than four), one at a time. */
    for (col = blocked_cols; col < offset; col++)
    {
        FLOAT *column  = &a[col*lda];
        FLOAT scaled_x = alpha * x[col];
        FLOAT dotsum   = 0.0;

        y[col] += scaled_x * column[col];

        for (row = col + 1; row < m; row++)
        {
            y[row] += scaled_x * column[row];
            dotsum += column[row] * x[row];
        }

        y[col] += alpha * dotsum;
    }

    return(0);
}

View File

@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

/*
 * Micro-kernel (VEX-encoded, four-operand vfmaddps) for four columns
 * a[0..3] over rows [from, to), four rows per iteration:
 *     y[r]     += temp1[c] * a[c][r]
 *     temp2[c] += a[c][r] * x[r]
 *
 * from, to   row range; (to - from) must be a positive multiple of 4,
 *            because the loop body runs before the test and exits only when
 *            the advancing index equals `to` exactly
 * a          array of four column pointers
 * x, y       input vector and vector updated in place (unaligned accesses)
 * temp1      four scale factors, each broadcast to a full register
 * temp2      four running sums; the kernel adds its results to them
 *
 * The asm advances `from` (%0), so it is declared as a read-write ("+r")
 * operand rather than an input.
 */
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
    __asm__  __volatile__
    (
    "vxorps     %%xmm0 , %%xmm0 , %%xmm0             \n\t"  // temp2[0]
    "vxorps     %%xmm1 , %%xmm1 , %%xmm1             \n\t"  // temp2[1]
    "vxorps     %%xmm2 , %%xmm2 , %%xmm2             \n\t"  // temp2[2]
    "vxorps     %%xmm3 , %%xmm3 , %%xmm3             \n\t"  // temp2[3]
    "vbroadcastss   (%8), %%xmm4                     \n\t"  // temp1[0]
    "vbroadcastss  4(%8), %%xmm5                     \n\t"  // temp1[1]
    "vbroadcastss  8(%8), %%xmm6                     \n\t"  // temp1[2]
    "vbroadcastss 12(%8), %%xmm7                     \n\t"  // temp1[3]

    ".align 16                                       \n\t"
    ".L01LOOP%=:                                     \n\t"
    "vmovups    (%4,%0,4), %%xmm12                   \n\t"  // 4 * a[0]
    "vmovups    (%2,%0,4), %%xmm8                    \n\t"  // 4 * x
    "vmovups    (%3,%0,4), %%xmm9                    \n\t"  // 4 * y
    "vmovups    (%5,%0,4), %%xmm13                   \n\t"  // 4 * a[1]

    "vfmaddps   %%xmm0 , %%xmm8, %%xmm12 , %%xmm0    \n\t"  // temp2 += x * a
    "vfmaddps   %%xmm9 , %%xmm4, %%xmm12 , %%xmm9    \n\t"  // y += temp1 * a
    "vmovups    (%6,%0,4), %%xmm14                   \n\t"  // 4 * a[2]

    "vfmaddps   %%xmm1 , %%xmm8, %%xmm13 , %%xmm1    \n\t"  // temp2 += x * a
    "vfmaddps   %%xmm9 , %%xmm5, %%xmm13 , %%xmm9    \n\t"  // y += temp1 * a
    "vmovups    (%7,%0,4), %%xmm15                   \n\t"  // 4 * a[3]

    "vfmaddps   %%xmm2 , %%xmm8, %%xmm14 , %%xmm2    \n\t"  // temp2 += x * a
    "vfmaddps   %%xmm9 , %%xmm6, %%xmm14 , %%xmm9    \n\t"  // y += temp1 * a

    "vfmaddps   %%xmm3 , %%xmm8, %%xmm15 , %%xmm3    \n\t"  // temp2 += x * a
    "vfmaddps   %%xmm9 , %%xmm7, %%xmm15 , %%xmm9    \n\t"  // y += temp1 * a

    "addq       $4 , %0                              \n\t"  // next 4 rows
    "vmovups    %%xmm9 , -16(%3,%0,4)                \n\t"  // store y (index already advanced)

    "cmpq       %0 , %1                              \n\t"
    "jnz        .L01LOOP%=                           \n\t"

    "vmovss       (%9), %%xmm4                       \n\t"  // incoming temp2[0]
    "vmovss      4(%9), %%xmm5                       \n\t"  // incoming temp2[1]
    "vmovss      8(%9), %%xmm6                       \n\t"  // incoming temp2[2]
    "vmovss     12(%9), %%xmm7                       \n\t"  // incoming temp2[3]

    "vhaddps    %%xmm0, %%xmm0, %%xmm0               \n\t"  // horizontal sums
    "vhaddps    %%xmm1, %%xmm1, %%xmm1               \n\t"
    "vhaddps    %%xmm2, %%xmm2, %%xmm2               \n\t"
    "vhaddps    %%xmm3, %%xmm3, %%xmm3               \n\t"
    "vhaddps    %%xmm0, %%xmm0, %%xmm0               \n\t"
    "vhaddps    %%xmm1, %%xmm1, %%xmm1               \n\t"
    "vhaddps    %%xmm2, %%xmm2, %%xmm2               \n\t"
    "vhaddps    %%xmm3, %%xmm3, %%xmm3               \n\t"

    "vaddss     %%xmm4, %%xmm0, %%xmm0               \n\t"  // add incoming temp2
    "vaddss     %%xmm5, %%xmm1, %%xmm1               \n\t"
    "vaddss     %%xmm6, %%xmm2, %%xmm2               \n\t"
    "vaddss     %%xmm7, %%xmm3, %%xmm3               \n\t"

    "vmovss     %%xmm0 ,   (%9)                      \n\t"  // save temp2
    "vmovss     %%xmm1 ,  4(%9)                      \n\t"  // save temp2
    "vmovss     %%xmm2 ,  8(%9)                      \n\t"  // save temp2
    "vmovss     %%xmm3 , 12(%9)                      \n\t"  // save temp2

    :
      "+r" (from)   // 0  (modified: row index)
    :
      "r" (to),     // 1
      "r" (x),      // 2
      "r" (y),      // 3
      "r" (a[0]),   // 4
      "r" (a[1]),   // 5
      "r" (a[2]),   // 6
      "r" (a[3]),   // 7
      "r" (temp1),  // 8
      "r" (temp2)   // 9
    : "cc",
      "%xmm0", "%xmm1", "%xmm2", "%xmm3",
      "%xmm4", "%xmm5", "%xmm6", "%xmm7",
      "%xmm8", "%xmm9", "%xmm10", "%xmm11",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

View File

@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

/*
 * SSE micro-kernel for four columns a[0..3] over rows [from, to), four rows
 * per iteration:
 *     y[r]     += temp1[c] * a[c][r]
 *     temp2[c] += a[c][r] * x[r]
 *
 * from, to   row range; (to - from) must be a positive multiple of 4,
 *            because the loop body runs before the test and exits only when
 *            the advancing index equals `to` exactly
 * a          array of four column pointers
 * x, y       input vector and vector updated in place (unaligned accesses)
 * temp1      four scale factors, each broadcast to a full register
 * temp2      four running sums; the kernel adds its results to them
 *
 * The asm advances `from` (%0), so it is declared as a read-write ("+r")
 * operand rather than an input.
 */
static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
    __asm__  __volatile__
    (
    "xorps      %%xmm0 , %%xmm0          \n\t"  // temp2[0]
    "xorps      %%xmm1 , %%xmm1          \n\t"  // temp2[1]
    "xorps      %%xmm2 , %%xmm2          \n\t"  // temp2[2]
    "xorps      %%xmm3 , %%xmm3          \n\t"  // temp2[3]
    "movss        (%8), %%xmm4           \n\t"  // temp1[0]
    "movss       4(%8), %%xmm5           \n\t"  // temp1[1]
    "movss       8(%8), %%xmm6           \n\t"  // temp1[2]
    "movss      12(%8), %%xmm7           \n\t"  // temp1[3]
    "shufps $0, %%xmm4, %%xmm4           \n\t"  // broadcast to all 4 lanes
    "shufps $0, %%xmm5, %%xmm5           \n\t"
    "shufps $0, %%xmm6, %%xmm6           \n\t"
    "shufps $0, %%xmm7, %%xmm7           \n\t"

    ".align 16                           \n\t"
    ".L01LOOP%=:                         \n\t"
    "movups     (%2,%0,4), %%xmm8        \n\t"  // 4 * x
    "movups     (%3,%0,4), %%xmm9        \n\t"  // 4 * y

    "movups     (%4,%0,4), %%xmm12       \n\t"  // 4 * a[0]
    "movups     (%5,%0,4), %%xmm13       \n\t"  // 4 * a[1]

    "movups     %%xmm12 , %%xmm11        \n\t"
    "mulps      %%xmm4  , %%xmm11        \n\t"  // temp1 * a
    "addps      %%xmm11 , %%xmm9         \n\t"  // y += temp1 * a
    "mulps      %%xmm8  , %%xmm12        \n\t"  // a * x
    "addps      %%xmm12 , %%xmm0         \n\t"  // temp2 += x * a

    "movups     (%6,%0,4), %%xmm14       \n\t"  // 4 * a[2]
    "movups     (%7,%0,4), %%xmm15       \n\t"  // 4 * a[3]

    "movups     %%xmm13 , %%xmm11        \n\t"
    "mulps      %%xmm5  , %%xmm11        \n\t"  // temp1 * a
    "addps      %%xmm11 , %%xmm9         \n\t"  // y += temp1 * a
    "mulps      %%xmm8  , %%xmm13        \n\t"  // a * x
    "addps      %%xmm13 , %%xmm1         \n\t"  // temp2 += x * a

    "movups     %%xmm14 , %%xmm11        \n\t"
    "mulps      %%xmm6  , %%xmm11        \n\t"  // temp1 * a
    "addps      %%xmm11 , %%xmm9         \n\t"  // y += temp1 * a
    "mulps      %%xmm8  , %%xmm14        \n\t"  // a * x
    "addps      %%xmm14 , %%xmm2         \n\t"  // temp2 += x * a

    "movups     %%xmm15 , %%xmm11        \n\t"
    "mulps      %%xmm7  , %%xmm11        \n\t"  // temp1 * a
    "addps      %%xmm11 , %%xmm9         \n\t"  // y += temp1 * a
    "mulps      %%xmm8  , %%xmm15        \n\t"  // a * x
    "addps      %%xmm15 , %%xmm3         \n\t"  // temp2 += x * a

    "movups     %%xmm9, (%3,%0,4)        \n\t"  // store 4 * y

    "addq       $4 , %0                  \n\t"  // next 4 rows
    "cmpq       %0 , %1                  \n\t"
    "jnz        .L01LOOP%=               \n\t"

    "movss        (%9), %%xmm4           \n\t"  // incoming temp2[0]
    "movss       4(%9), %%xmm5           \n\t"  // incoming temp2[1]
    "movss       8(%9), %%xmm6           \n\t"  // incoming temp2[2]
    "movss      12(%9), %%xmm7           \n\t"  // incoming temp2[3]

    "haddps     %%xmm0, %%xmm0           \n\t"  // horizontal sums
    "haddps     %%xmm1, %%xmm1           \n\t"
    "haddps     %%xmm2, %%xmm2           \n\t"
    "haddps     %%xmm3, %%xmm3           \n\t"
    "haddps     %%xmm0, %%xmm0           \n\t"
    "haddps     %%xmm1, %%xmm1           \n\t"
    "haddps     %%xmm2, %%xmm2           \n\t"
    "haddps     %%xmm3, %%xmm3           \n\t"

    "addss      %%xmm4, %%xmm0           \n\t"  // add incoming temp2
    "addss      %%xmm5, %%xmm1           \n\t"
    "addss      %%xmm6, %%xmm2           \n\t"
    "addss      %%xmm7, %%xmm3           \n\t"

    "movss      %%xmm0 ,   (%9)          \n\t"  // save temp2
    "movss      %%xmm1 ,  4(%9)          \n\t"  // save temp2
    "movss      %%xmm2 ,  8(%9)          \n\t"  // save temp2
    "movss      %%xmm3 , 12(%9)          \n\t"  // save temp2

    :
      "+r" (from)   // 0  (modified: row index)
    :
      "r" (to),     // 1
      "r" (x),      // 2
      "r" (y),      // 3
      "r" (a[0]),   // 4
      "r" (a[1]),   // 5
      "r" (a[2]),   // 6
      "r" (a[3]),   // 7
      "r" (temp1),  // 8
      "r" (temp2)   // 9
    : "cc",
      "%xmm0", "%xmm1", "%xmm2", "%xmm3",
      "%xmm4", "%xmm5", "%xmm6", "%xmm7",
      "%xmm8", "%xmm9", "%xmm10", "%xmm11",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

273
kernel/x86_64/ssymv_U.c Normal file
View File

@ -0,0 +1,273 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "ssymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_U_microk_nehalem-2.c"
#endif
#ifndef HAVE_KERNEL_4x4

/*
 * Portable fallback used when no architecture-specific micro-kernel has
 * defined HAVE_KERNEL_4x4.
 *
 * For every row r in [0, n), with the four columns a0..a3:
 *     yp[r]    += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     temp2[c] += ac[r] * xp[r]       (summed locally, added at the end)
 */
static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
    /* Scale factors are read once, before any store to yp. */
    FLOAT s0 = temp1[0];
    FLOAT s1 = temp1[1];
    FLOAT s2 = temp1[2];
    FLOAT s3 = temp1[3];

    FLOAT d0 = 0.0, d1 = 0.0, d2 = 0.0, d3 = 0.0;
    BLASLONG row;

    for (row = 0; row < n; row++)
    {
        /* Load the whole row before updating yp[row]. */
        FLOAT c0 = a0[row];
        FLOAT c1 = a1[row];
        FLOAT c2 = a2[row];
        FLOAT c3 = a3[row];
        FLOAT xv = xp[row];

        yp[row] += s0 * c0 + s1 *c1 + s2 * c2 + s3 * c3;

        d0 += c0 * xv;
        d1 += c1 * xv;
        d2 += c2 * xv;
        d3 += c3 * xv;
    }

    temp2[0] += d0;
    temp2[1] += d1;
    temp2[2] += d2;
    temp2[3] += d3;
}

#endif
#ifndef HAVE_KERNEL_1x4

/*
 * Portable fallback used when HAVE_KERNEL_1x4 is not defined.
 *
 * Same update as the four-column kernel, but over an arbitrary row range
 * [from, to) taken one row at a time:
 *     yp[r]    += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *     temp2[c] += ac[r] * xp[r]       (summed locally, added at the end)
 */
static void ssymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
    /* Scale factors are read once, before any store to yp. */
    FLOAT s0 = temp1[0];
    FLOAT s1 = temp1[1];
    FLOAT s2 = temp1[2];
    FLOAT s3 = temp1[3];

    FLOAT d0 = 0.0, d1 = 0.0, d2 = 0.0, d3 = 0.0;
    BLASLONG row = from;

    while (row < to)
    {
        /* Load the whole row before updating yp[row]. */
        FLOAT c0 = a0[row];
        FLOAT c1 = a1[row];
        FLOAT c2 = a2[row];
        FLOAT c3 = a3[row];
        FLOAT xv = xp[row];

        yp[row] += s0 * c0 + s1 *c1 + s2 * c2 + s3 * c3;

        d0 += c0 * xv;
        d1 += c1 * xv;
        d2 += c2 * xv;
        d3 += c3 * xv;

        row++;
    }

    temp2[0] += d0;
    temp2[1] += d1;
    temp2[2] += d2;
    temp2[3] += d3;
}

#endif
/*
 * Single-column helper.  For rows r in [0, (n/4)*4), four rows per pass:
 *     yp[r] += (*temp1) * a0[r]
 *     sum   += a0[r] * xp[r]
 *
 * The sum is STORED into *temp2 (the previous value is overwritten, not
 * accumulated).  Rows from (n/4)*4 up to n are not touched.
 */
static void ssymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
{
    FLOAT scale = *temp1;
    FLOAT acc = 0.0;
    BLASLONG limit = (n/4)*4;
    BLASLONG base, k;

    for (base = 0; base < limit; base += 4)
    {
        FLOAT col[4];

        /* Load all four column entries before any store to yp. */
        for (k = 0; k < 4; k++)
            col[k] = a0[base + k];

        for (k = 0; k < 4; k++)
        {
            yp[base + k] += scale * col[k];
            acc          += col[k] * xp[base + k];
        }
    }

    *temp2 = acc;
}
/*
 * SSYMV driver working on the part of each column at and above the diagonal
 * (the disabled debug print labels it "Symv_U").
 *
 * For every column c in [m - offset, m), with column data at a + c*lda:
 *     y[r] += alpha * x[c] * a[c*lda + r]              for 0 <= r < c
 *     y[c] += alpha * x[c] * a[c*lda + c]
 *           + alpha * sum over 0 <= r < c of a[c*lda + r] * x[r]
 *
 * m, offset      matrix order and number of trailing columns to process
 * alpha          scalar applied to every contribution
 * a, lda         matrix storage and the distance between columns
 * x, inc_x       input vector and its element stride
 * y, inc_y       vector updated in place and its element stride
 * buffer         not referenced by this routine
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG ix,iy;
    BLASLONG jx,jy;
    BLASLONG j;
    BLASLONG j1;
    BLASLONG j2;
    BLASLONG m2;
    FLOAT temp1;
    FLOAT temp2;
    FLOAT *xp, *yp;
    FLOAT *a0,*a1,*a2,*a3;
    FLOAT tmp1[4];
    FLOAT tmp2[4];

#if 0
    if( m != offset )
        printf("Symv_U: m=%d offset=%d\n",m,offset);
#endif

    BLASLONG m1 = m - offset;       /* first column to process       */
    BLASLONG mrange = m -m1;        /* number of columns to process  */

    /* Non-unit strides or a small column range: plain scalar code. */
    if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) )
    {
        jx = m1 * inc_x;
        jy = m1 * inc_y;

        for (j=m1; j<m; j++)
        {
            temp1 = alpha * x[jx];
            temp2 = 0.0;
            iy = 0;
            ix = 0;
            for (i=0; i<j; i++)
            {
                y[iy] += temp1 * a[j*lda+i];
                temp2 += a[j*lda+i] * x[ix];
                ix += inc_x;
                iy += inc_y;
            }
            y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
            jx += inc_x;
            jy += inc_y;
        }
        return(0);
    }

    xp = x;
    yp = y;

    /* Columns m1 .. m2-1 are handled four at a time; m2 - m1 is a
       multiple of 4. */
    m2 = m - ( mrange % 4 );

    for (j=m1; j<m2; j+=4)
    {
        tmp1[0] = alpha * xp[j];
        tmp1[1] = alpha * xp[j+1];
        tmp1[2] = alpha * xp[j+2];
        tmp1[3] = alpha * xp[j+3];
        tmp2[0] = 0.0;
        tmp2[1] = 0.0;
        tmp2[2] = 0.0;
        tmp2[3] = 0.0;

        a0 = &a[j*lda];
        a1 = a0+lda;
        a2 = a1+lda;
        a3 = a2+lda;

        /* Rows 0 .. j1-1 (j1 = j rounded down to a multiple of 8) go to
           the 4-column kernel, rows j1 .. j-1 to the row-at-a-time one. */
        j1 = (j/8)*8;
        if ( j1 )
            ssymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
        if ( j1 < j )
            ssymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2);

        /* The 4x4 block on the diagonal: for each of the four columns,
           the rows j .. j1-1 above its diagonal entry, then the diagonal
           entry together with the accumulated dot product. */
        j2 = 0;
        for ( j1 = j ; j1 < j+4 ; j1++ )
        {
            temp1 = tmp1[j2];
            temp2 = tmp2[j2];
            a0    = &a[j1*lda];
            for ( i=j ; i<j1; i++ )
            {
                yp[i] += temp1 * a0[i];
                temp2 += a0[i] * xp[i];
            }
            y[j1] += temp1 * a0[j1] + alpha * temp2;
            j2++;
        }
    }

    /* Left-over columns (fewer than four), one at a time. */
    for ( ; j<m; j++)
    {
        temp1 = alpha * xp[j];
        temp2 = 0.0;
        a0    = &a[j*lda];
        FLOAT at0;

        /* ssymv_kernel_8x1 stores its sum into temp2 (0.0 at this point)
           for rows 0 .. j1-1; the scalar loop continues from row j1. */
        j1 = (j/8)*8;
        if ( j1 )
            ssymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2);

        for (i=j1 ; i<j; i++)
        {
            at0     = a0[i];
            yp[i]  += temp1 * at0;
            temp2  += at0 * xp[i];
        }

        yp[j] += temp1 * a0[j] + alpha * temp2;
    }

    return(0);
}

View File

@ -0,0 +1,114 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * ssymv 4-column micro-kernel using 128-bit FMA4 (vfmaddps) instructions.
 *
 * For rows i = 0 .. n-1 and the four columns a0..a3:
 *     y[i]     += temp1[0]*a0[i] + temp1[1]*a1[i] + temp1[2]*a2[i] + temp1[3]*a3[i]
 *     temp2[k]  = sum_i x[i] * ak[i]                     (k = 0..3)
 *
 * temp2[0..3] is overwritten, not accumulated: the accumulators start at
 * zero and are stored at the end.
 *
 * n must be a positive multiple of 4: four rows are processed per loop
 * iteration and the loop terminates only when the counter reaches exactly 0.
 *
 * The loop index (i) and the row counter (n) are modified by the assembly,
 * so they are declared as read-write ("+r") operands; input-only operands
 * must not be changed by an asm statement.
 */
static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"vxorps		%%xmm0 , %%xmm0 , %%xmm0	\n\t"	// temp2[0]
	"vxorps		%%xmm1 , %%xmm1 , %%xmm1	\n\t"	// temp2[1]
	"vxorps		%%xmm2 , %%xmm2 , %%xmm2	\n\t"	// temp2[2]
	"vxorps		%%xmm3 , %%xmm3 , %%xmm3	\n\t"	// temp2[3]
	"vbroadcastss	  (%8), %%xmm4			\n\t"	// temp1[0]
	"vbroadcastss	 4(%8), %%xmm5			\n\t"	// temp1[1]
	"vbroadcastss	 8(%8), %%xmm6			\n\t"	// temp1[2]
	"vbroadcastss	12(%8), %%xmm7			\n\t"	// temp1[3]

	"xorq		%0,%0				\n\t"

	".align 16					\n\t"
	".L01LOOP%=:					\n\t"

	"vmovups	(%2,%0,4), %%xmm8		\n\t"	// 4 * x
	"vmovups	(%3,%0,4), %%xmm9		\n\t"	// 4 * y

	"vmovups	(%4,%0,4), %%xmm12		\n\t"	// 4 * a0
	"vmovups	(%5,%0,4), %%xmm13		\n\t"	// 4 * a1

	"vfmaddps	%%xmm0 , %%xmm8, %%xmm12 , %%xmm0	\n\t"	// temp2[0] += x * a0
	"vfmaddps	%%xmm9 , %%xmm4, %%xmm12 , %%xmm9	\n\t"	// y += temp1[0] * a0
	"vfmaddps	%%xmm1 , %%xmm8, %%xmm13 , %%xmm1	\n\t"	// temp2[1] += x * a1
	"vmovups	(%6,%0,4), %%xmm14		\n\t"	// 4 * a2
	"vfmaddps	%%xmm9 , %%xmm5, %%xmm13 , %%xmm9	\n\t"	// y += temp1[1] * a1

	"vfmaddps	%%xmm2 , %%xmm8, %%xmm14 , %%xmm2	\n\t"	// temp2[2] += x * a2
	"vmovups	(%7,%0,4), %%xmm15		\n\t"	// 4 * a3
	"vfmaddps	%%xmm9 , %%xmm6, %%xmm14 , %%xmm9	\n\t"	// y += temp1[2] * a2

	"vfmaddps	%%xmm3 , %%xmm8, %%xmm15 , %%xmm3	\n\t"	// temp2[3] += x * a3
	"vfmaddps	%%xmm9 , %%xmm7, %%xmm15 , %%xmm9	\n\t"	// y += temp1[3] * a3

	"vmovups	%%xmm9 , (%3,%0,4)		\n\t"	// store 4 * y

	"addq		$4 , %0				\n\t"
	"subq		$4 , %1				\n\t"
	"jnz		.L01LOOP%=			\n\t"

	// horizontal sums: two passes reduce the 4 lanes to one value
	"vhaddps	%%xmm0, %%xmm0, %%xmm0		\n\t"
	"vhaddps	%%xmm1, %%xmm1, %%xmm1		\n\t"
	"vhaddps	%%xmm2, %%xmm2, %%xmm2		\n\t"
	"vhaddps	%%xmm3, %%xmm3, %%xmm3		\n\t"
	"vhaddps	%%xmm0, %%xmm0, %%xmm0		\n\t"
	"vhaddps	%%xmm1, %%xmm1, %%xmm1		\n\t"
	"vhaddps	%%xmm2, %%xmm2, %%xmm2		\n\t"
	"vhaddps	%%xmm3, %%xmm3, %%xmm3		\n\t"

	"vmovss		%%xmm0 ,   (%9)			\n\t"	// save temp2[0]
	"vmovss		%%xmm1 ,  4(%9)			\n\t"	// save temp2[1]
	"vmovss		%%xmm2 ,  8(%9)			\n\t"	// save temp2[2]
	"vmovss		%%xmm3 , 12(%9)			\n\t"	// save temp2[3]

	:
	  "+r" (i),	// 0	(modified: loop index)
	  "+r" (n)	// 1	(modified: remaining rows)
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (a0),	// 4
	  "r" (a1),	// 5
	  "r" (a2),	// 6
	  "r" (a3),	// 7
	  "r" (temp1),	// 8
	  "r" (temp2)	// 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

View File

@ -0,0 +1,130 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1

/*
 * ssymv 4-column micro-kernel using SSE/SSE3 instructions (mulps/addps/haddps).
 *
 * For rows i = 0 .. n-1 and the four columns a0..a3:
 *     y[i]     += temp1[0]*a0[i] + temp1[1]*a1[i] + temp1[2]*a2[i] + temp1[3]*a3[i]
 *     temp2[k]  = sum_i x[i] * ak[i]                     (k = 0..3)
 *
 * temp2[0..3] is overwritten, not accumulated: the accumulators start at
 * zero and are stored at the end.
 *
 * n must be a positive multiple of 4: four rows are processed per loop
 * iteration and the loop terminates only when the counter reaches exactly 0.
 *
 * The loop index (i) and the row counter (n) are modified by the assembly,
 * so they are declared as read-write ("+r") operands; input-only operands
 * must not be changed by an asm statement.
 */
static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));

static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"xorps		%%xmm0 , %%xmm0			\n\t"	// temp2[0]
	"xorps		%%xmm1 , %%xmm1			\n\t"	// temp2[1]
	"xorps		%%xmm2 , %%xmm2			\n\t"	// temp2[2]
	"xorps		%%xmm3 , %%xmm3			\n\t"	// temp2[3]
	"movss		  (%8), %%xmm4			\n\t"	// temp1[0]
	"movss		 4(%8), %%xmm5			\n\t"	// temp1[1]
	"movss		 8(%8), %%xmm6			\n\t"	// temp1[2]
	"movss		12(%8), %%xmm7			\n\t"	// temp1[3]
	"shufps		$0, %%xmm4, %%xmm4		\n\t"	// broadcast to all 4 lanes
	"shufps		$0, %%xmm5, %%xmm5		\n\t"
	"shufps		$0, %%xmm6, %%xmm6		\n\t"
	"shufps		$0, %%xmm7, %%xmm7		\n\t"

	"xorq		%0,%0				\n\t"

	".align 16					\n\t"
	".L01LOOP%=:					\n\t"

	"movups		(%2,%0,4), %%xmm8		\n\t"	// 4 * x
	"movups		(%3,%0,4), %%xmm9		\n\t"	// 4 * y

	"movups		(%4,%0,4), %%xmm12		\n\t"	// 4 * a0
	"movups		(%5,%0,4), %%xmm13		\n\t"	// 4 * a1

	"movups		%%xmm12 , %%xmm11		\n\t"
	"mulps		%%xmm4  , %%xmm11		\n\t"	// temp1[0] * a0
	"addps		%%xmm11 , %%xmm9		\n\t"	// y += temp1[0] * a0
	"mulps		%%xmm8  , %%xmm12		\n\t"	// a0 * x
	"addps		%%xmm12 , %%xmm0		\n\t"	// temp2[0] += x * a0

	"movups		(%6,%0,4), %%xmm14		\n\t"	// 4 * a2
	"movups		(%7,%0,4), %%xmm15		\n\t"	// 4 * a3

	"movups		%%xmm13 , %%xmm11		\n\t"
	"mulps		%%xmm5  , %%xmm11		\n\t"	// temp1[1] * a1
	"addps		%%xmm11 , %%xmm9		\n\t"	// y += temp1[1] * a1
	"mulps		%%xmm8  , %%xmm13		\n\t"	// a1 * x
	"addps		%%xmm13 , %%xmm1		\n\t"	// temp2[1] += x * a1

	"movups		%%xmm14 , %%xmm11		\n\t"
	"mulps		%%xmm6  , %%xmm11		\n\t"	// temp1[2] * a2
	"addps		%%xmm11 , %%xmm9		\n\t"	// y += temp1[2] * a2
	"mulps		%%xmm8  , %%xmm14		\n\t"	// a2 * x
	"addps		%%xmm14 , %%xmm2		\n\t"	// temp2[2] += x * a2

	"movups		%%xmm15 , %%xmm11		\n\t"
	"mulps		%%xmm7  , %%xmm11		\n\t"	// temp1[3] * a3
	"addps		%%xmm11 , %%xmm9		\n\t"	// y += temp1[3] * a3
	"mulps		%%xmm8  , %%xmm15		\n\t"	// a3 * x
	"addps		%%xmm15 , %%xmm3		\n\t"	// temp2[3] += x * a3

	"movups		%%xmm9, (%3,%0,4)		\n\t"	// store 4 * y

	"addq		$4 , %0				\n\t"
	"subq		$4 , %1				\n\t"
	"jnz		.L01LOOP%=			\n\t"

	// horizontal sums: two passes reduce the 4 lanes to one value
	"haddps		%%xmm0, %%xmm0			\n\t"
	"haddps		%%xmm1, %%xmm1			\n\t"
	"haddps		%%xmm2, %%xmm2			\n\t"
	"haddps		%%xmm3, %%xmm3			\n\t"
	"haddps		%%xmm0, %%xmm0			\n\t"
	"haddps		%%xmm1, %%xmm1			\n\t"
	"haddps		%%xmm2, %%xmm2			\n\t"
	"haddps		%%xmm3, %%xmm3			\n\t"

	"movss		%%xmm0 ,   (%9)			\n\t"	// save temp2[0]
	"movss		%%xmm1 ,  4(%9)			\n\t"	// save temp2[1]
	"movss		%%xmm2 ,  8(%9)			\n\t"	// save temp2[2]
	"movss		%%xmm3 , 12(%9)			\n\t"	// save temp2[3]

	:
	  "+r" (i),	// 0	(modified: loop index)
	  "+r" (n)	// 1	(modified: remaining rows)
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (a0),	// 4
	  "r" (a1),	// 5
	  "r" (a2),	// 6
	  "r" (a3),	// 7
	  "r" (temp1),	// 8
	  "r" (temp2)	// 9
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

131
kernel/x86_64/zaxpy.c Normal file
View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER)
#include "zaxpy_microk_bulldozer-2.c"
#endif
#ifndef HAVE_KERNEL_4

/*
 * Portable fallback used when no architecture-specific zaxpy_kernel_4 has
 * been provided (HAVE_KERNEL_4 undefined).
 *
 * x and y hold interleaved (real, imag) pairs; alpha[0] / alpha[1] are the
 * real / imaginary part of the scalar.  For n complex elements:
 *     y += alpha * x           when CONJ is not defined
 *     y += alpha * conj(x)     when CONJ is defined
 * Two complex elements are handled per pass.
 */
static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT re = alpha[0];
	const FLOAT im = alpha[1];
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG left;

	for (left = n; left > 0; left -= 2)
	{
#if !defined(CONJ)
		dst[0] += ( re * src[0] - im * src[1] ) ;
		dst[1] += ( re * src[1] + im * src[0] ) ;
		dst[2] += ( re * src[2] - im * src[3] ) ;
		dst[3] += ( re * src[3] + im * src[2] ) ;
#else
		dst[0] += ( re * src[0] + im * src[1] ) ;
		dst[1] -= ( re * src[1] - im * src[0] ) ;
		dst[2] += ( re * src[2] + im * src[3] ) ;
		dst[3] -= ( re * src[3] - im * src[2] ) ;
#endif
		/* advance past the two complex values just processed */
		src += 4;
		dst += 4;
	}
}

#endif
/*
 * Complex AXPY driver on interleaved (real, imag) data:
 *     y += alpha * x           when CONJ is not defined
 *     y += alpha * conj(x)     when CONJ is defined
 * with alpha = da_r + i*da_i.
 *
 * Parameters:
 *   n                 number of complex elements; nothing is done for n <= 0
 *   da_r, da_i        real and imaginary part of alpha
 *   x, inc_x          source vector and its stride in complex elements
 *   y, inc_y          vector updated in place and its stride in complex elements
 *   dummy0, dummy1,
 *   dummy, dummy2     not referenced by this routine
 *
 * Returns 0 in all cases.
 *
 * When both strides are 1, the largest multiple of four elements is handed
 * to zaxpy_kernel_4 and the remaining (at most three) elements are handled
 * by the scalar loop that follows.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	FLOAT da[2];

	if ( n <= 0 )  return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* Keep the full BLASLONG width: an int would truncate large n. */
		BLASLONG n1 = n & -4;

		if ( n1 )
		{
			da[0] = da_r;
			da[1] = da_i;
			/* da decays to FLOAT *, matching the kernel's alpha parameter */
			zaxpy_kernel_4(n1, x, y , da );
			ix = 2 * n1;
		}
		i = n1;

		/* tail: elements n1 .. n-1 */
		while(i < n)
		{
#if !defined(CONJ)
			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
			i++ ;
			ix += 2;
		}
		return(0);
	}

	/* Strided path: convert strides from complex elements to FLOATs. */
	inc_x *=2;
	inc_y *=2;

	while(i < n)
	{
#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(0);
}

View File

@ -0,0 +1,135 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4 1

/*
 * zaxpy micro-kernel using 128-bit AVX/FMA4 instructions on 8-byte
 * (double precision) interleaved complex data.
 *
 * alpha[0] / alpha[1] are the real / imaginary part of the scalar.
 * For n complex elements:
 *     y += alpha * x           when CONJ is not defined
 *     y += alpha * conj(x)     when CONJ is defined
 *
 * n must be a positive multiple of 4: four complex values (64 bytes of x
 * and of y) are processed per loop iteration and the loop terminates only
 * when the counter reaches exactly 0.
 *
 * The index register (i, counted in doubles) and the element counter (n)
 * are modified by the assembly, so they are declared as read-write ("+r")
 * operands; input-only operands must not be changed by an asm statement.
 * The index is not cleared inside the asm and relies on i being 0 on entry.
 */
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"vmovddup	 (%4), %%xmm0			\n\t"	// real part of alpha
	"vmovddup	8(%4), %%xmm1			\n\t"	// imag part of alpha

	".align 16					\n\t"
	".L01LOOP%=:					\n\t"

	"prefetcht0	768(%2,%0,8)			\n\t"
	"vmovups	  (%2,%0,8), %%xmm5		\n\t"	// 1 complex value from x
	"vmovups	16(%2,%0,8), %%xmm7		\n\t"	// 1 complex value from x
	"vmovups	32(%2,%0,8), %%xmm9		\n\t"	// 1 complex value from x
	"vmovups	48(%2,%0,8), %%xmm11		\n\t"	// 1 complex value from x
	"prefetcht0	768(%3,%0,8)			\n\t"

#if !defined(CONJ)
	"vfmaddpd	  (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12	\n\t"	// y + a_r * x
	"vpermilpd	$0x1 , %%xmm5 , %%xmm4		\n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm4 , %%xmm4		\n\t"	// a_i*x_i, a_i*x_r

	"vfmaddpd	16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13	\n\t"
	"vpermilpd	$0x1 , %%xmm7 , %%xmm6		\n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm6 , %%xmm6		\n\t"

	"vfmaddpd	32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14	\n\t"
	"vpermilpd	$0x1 , %%xmm9 , %%xmm8		\n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm8 , %%xmm8		\n\t"

	"vfmaddpd	48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15	\n\t"
	"vpermilpd	$0x1 , %%xmm11, %%xmm10		\n\t"	// exchange real and imag part
	"vmulpd		%%xmm1, %%xmm10, %%xmm10	\n\t"

	"vaddsubpd	%%xmm4, %%xmm12, %%xmm12	\n\t"	// real: subtract, imag: add
	"vaddsubpd	%%xmm6, %%xmm13, %%xmm13	\n\t"
	"vaddsubpd	%%xmm8, %%xmm14, %%xmm14	\n\t"
	"vaddsubpd	%%xmm10,%%xmm15, %%xmm15	\n\t"

#else

	"vmulpd		%%xmm0, %%xmm5, %%xmm4		\n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm5, %%xmm5		\n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm7, %%xmm6		\n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm7, %%xmm7		\n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm9, %%xmm8		\n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm9, %%xmm9		\n\t"	// a_i*x_r, a_i*x_i
	"vmulpd		%%xmm0, %%xmm11, %%xmm10	\n\t"	// a_r*x_r, a_r*x_i
	"vmulpd		%%xmm1, %%xmm11, %%xmm11	\n\t"	// a_i*x_r, a_i*x_i

	"vpermilpd	$0x1 , %%xmm4 , %%xmm4		\n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm4 ,%%xmm5 , %%xmm4		\n\t"
	"vpermilpd	$0x1 , %%xmm4 , %%xmm4		\n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm6 , %%xmm6		\n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm6 ,%%xmm7 , %%xmm6		\n\t"
	"vpermilpd	$0x1 , %%xmm6 , %%xmm6		\n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm8 , %%xmm8		\n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm8 ,%%xmm9 , %%xmm8		\n\t"
	"vpermilpd	$0x1 , %%xmm8 , %%xmm8		\n\t"	// exchange real and imag part

	"vpermilpd	$0x1 , %%xmm10, %%xmm10		\n\t"	// exchange real and imag part
	"vaddsubpd	%%xmm10,%%xmm11, %%xmm10	\n\t"
	"vpermilpd	$0x1 , %%xmm10, %%xmm10		\n\t"	// exchange real and imag part

	"vaddpd		  (%3,%0,8) ,%%xmm4 , %%xmm12	\n\t"	// add y
	"vaddpd		16(%3,%0,8) ,%%xmm6 , %%xmm13	\n\t"
	"vaddpd		32(%3,%0,8) ,%%xmm8 , %%xmm14	\n\t"
	"vaddpd		48(%3,%0,8) ,%%xmm10, %%xmm15	\n\t"

#endif

	"vmovups	%%xmm12,   (%3,%0,8)		\n\t"	// store 4 complex values of y
	"vmovups	%%xmm13, 16(%3,%0,8)		\n\t"
	"vmovups	%%xmm14, 32(%3,%0,8)		\n\t"
	"vmovups	%%xmm15, 48(%3,%0,8)		\n\t"

	"addq		$8 , %0				\n\t"	// 4 complex = 8 doubles
	"subq		$4 , %1				\n\t"
	"jnz		.L01LOOP%=			\n\t"

	:
	  "+r" (i),	// 0	(modified: index in doubles)
	  "+r" (n)	// 1	(modified: remaining complex elements)
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha)	// 4
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

View File

@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 )
addq $6*SIZE, BO
addq $8*SIZE, AO
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 )
addq $6*SIZE, BO
addq $4*SIZE, AO
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
addq $6*SIZE, BO
addq $2*SIZE, AO
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm