Merge pull request #549 from wernsaar/develop
added optimized dsymv kernels for haswell and sandybridge
This commit is contained in:
commit
406d9d64e9
4
Makefile
4
Makefile
|
@ -20,6 +20,8 @@ ifneq ($(NO_LAPACK), 1)
|
||||||
SUBDIRS += lapack
|
SUBDIRS += lapack
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# LAPACK_FFLAGS with every optimization-level switch removed.  It is written
# into make.inc after an explicit -O0 (NOOPT = -O0 $(LAPACK_NOOPT)), so any
# -O switch left in here would follow that -O0 on the command line and
# re-enable optimization.  -O, -Os and -Og are filtered in addition to the
# numeric levels and -Ofast.
LAPACK_NOOPT := $(filter-out -O -O0 -O1 -O2 -O3 -Os -Og -Ofast,$(LAPACK_FFLAGS))
|
||||||
|
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||||
|
|
||||||
.PHONY : all libs netlib test ctest shared install
|
.PHONY : all libs netlib test ctest shared install
|
||||||
|
@ -231,7 +233,7 @@ ifndef NOFORTRAN
|
||||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
|
|
@ -42,6 +42,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
sger.goto dger.goto \
|
sger.goto dger.goto \
|
||||||
sdot.goto ddot.goto cdot.goto zdot.goto \
|
sdot.goto ddot.goto cdot.goto zdot.goto \
|
||||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||||
|
sscal.goto dscal.goto cscal.goto zscal.goto \
|
||||||
|
sasum.goto dasum.goto casum.goto zasum.goto \
|
||||||
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
|
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
|
||||||
chemv.goto zhemv.goto \
|
chemv.goto zhemv.goto \
|
||||||
chemm.goto zhemm.goto \
|
chemm.goto zhemm.goto \
|
||||||
|
@ -63,6 +65,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
sger.acml dger.acml \
|
sger.acml dger.acml \
|
||||||
sdot.acml ddot.acml cdot.acml zdot.acml \
|
sdot.acml ddot.acml cdot.acml zdot.acml \
|
||||||
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
|
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
|
||||||
|
sscal.acml dscal.acml cscal.acml zscal.acml \
|
||||||
|
sasum.acml dasum.acml casum.acml zasum.acml \
|
||||||
ssymv.acml dsymv.acml csymv.acml zsymv.acml \
|
ssymv.acml dsymv.acml csymv.acml zsymv.acml \
|
||||||
chemv.acml zhemv.acml \
|
chemv.acml zhemv.acml \
|
||||||
chemm.acml zhemm.acml \
|
chemm.acml zhemm.acml \
|
||||||
|
@ -84,6 +88,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
||||||
sger.atlas dger.atlas \
|
sger.atlas dger.atlas \
|
||||||
sdot.atlas ddot.atlas \
|
sdot.atlas ddot.atlas \
|
||||||
saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
|
saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
|
||||||
|
sscal.atlas dscal.atlas cscal.atlas zscal.atlas \
|
||||||
|
sasum.atlas dasum.atlas casum.atlas zasum.atlas \
|
||||||
ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
|
ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
|
||||||
chemv.atlas zhemv.atlas \
|
chemv.atlas zhemv.atlas \
|
||||||
chemm.acml zhemm.acml \
|
chemm.acml zhemm.acml \
|
||||||
|
@ -106,6 +112,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||||
sger.mkl dger.mkl \
|
sger.mkl dger.mkl \
|
||||||
sdot.mkl ddot.mkl cdot.mkl zdot.mkl \
|
sdot.mkl ddot.mkl cdot.mkl zdot.mkl \
|
||||||
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
|
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
|
||||||
|
sscal.mkl dscal.mkl cscal.mkl zscal.mkl \
|
||||||
|
sasum.mkl dasum.mkl casum.mkl zasum.mkl \
|
||||||
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
|
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
|
||||||
chemv.mkl zhemv.mkl \
|
chemv.mkl zhemv.mkl \
|
||||||
chemm.mkl zhemm.mkl \
|
chemm.mkl zhemm.mkl \
|
||||||
|
@ -1078,6 +1086,116 @@ zaxpy.atlas : zaxpy.$(SUFFIX)
|
||||||
zaxpy.mkl : zaxpy.$(SUFFIX)
|
zaxpy.mkl : zaxpy.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Scal ####################################################
# SCAL benchmark executables for the four precisions (sscal, dscal, cscal,
# zscal).  One static pattern rule per BLAS backend; the stem is the benchmark
# name, so each executable is linked from its own <name>.$(SUFFIX) object.

# Linked against ../$(LIBNAME).  No '-' prefix: a failed link stops make.
sscal.goto dscal.goto cscal.goto zscal.goto : %.goto : %.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm

# Linked against $(LIBACML).  The '-' prefix makes make ignore a failed link.
sscal.acml dscal.acml cscal.acml zscal.acml : %.acml : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# Linked against $(LIBATLAS).  The '-' prefix makes make ignore a failed link.
sscal.atlas dscal.atlas cscal.atlas zscal.atlas : %.atlas : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# Linked against $(LIBMKL).  The '-' prefix makes make ignore a failed link.
sscal.mkl dscal.mkl cscal.mkl zscal.mkl : %.mkl : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Asum ####################################################
# ASUM benchmark executables for the four precisions (sasum, dasum, casum,
# zasum).  Each backend gets a single static pattern rule whose stem is the
# benchmark name; the executable is linked from the matching
# <name>.$(SUFFIX) object.

# Built against ../$(LIBNAME); errors are fatal (recipe has no '-' prefix).
sasum.goto dasum.goto casum.goto zasum.goto : %.goto : %.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm

# Built against $(LIBACML); link errors are ignored ('-' prefix).
sasum.acml dasum.acml casum.acml zasum.acml : %.acml : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# Built against $(LIBATLAS); link errors are ignored ('-' prefix).
sasum.atlas dasum.atlas casum.atlas zasum.atlas : %.atlas : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# Built against $(LIBMKL); link errors are ignored ('-' prefix).
sasum.mkl dasum.mkl casum.mkl zasum.mkl : %.mkl : %.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##################################### Cgemm3m ####################################################
|
##################################### Cgemm3m ####################################################
|
||||||
|
|
||||||
|
@ -1316,6 +1434,33 @@ caxpy.$(SUFFIX) : axpy.c
|
||||||
zaxpy.$(SUFFIX) : axpy.c
|
zaxpy.$(SUFFIX) : axpy.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
# scal.c is compiled once per precision.  The COMPLEX / DOUBLE macros are
# defined (-D) or undefined (-U) per target through a target-specific
# variable, and all four objects share one recipe.
sscal.$(SUFFIX) : SCAL_VARIANT := -UCOMPLEX -UDOUBLE
dscal.$(SUFFIX) : SCAL_VARIANT := -UCOMPLEX -DDOUBLE
cscal.$(SUFFIX) : SCAL_VARIANT := -DCOMPLEX -UDOUBLE
zscal.$(SUFFIX) : SCAL_VARIANT := -DCOMPLEX -DDOUBLE

sscal.$(SUFFIX) dscal.$(SUFFIX) cscal.$(SUFFIX) zscal.$(SUFFIX) : scal.c
	$(CC) $(CFLAGS) -c $(SCAL_VARIANT) -o $(@F) $^
|
||||||
|
|
||||||
|
# asum.c is compiled once per precision.  Each object selects its variant by
# defining (-D) or undefining (-U) COMPLEX and DOUBLE via a target-specific
# variable; the compile recipe itself is shared.
sasum.$(SUFFIX) : ASUM_VARIANT := -UCOMPLEX -UDOUBLE
dasum.$(SUFFIX) : ASUM_VARIANT := -UCOMPLEX -DDOUBLE
casum.$(SUFFIX) : ASUM_VARIANT := -DCOMPLEX -UDOUBLE
zasum.$(SUFFIX) : ASUM_VARIANT := -DCOMPLEX -DDOUBLE

sasum.$(SUFFIX) dasum.$(SUFFIX) casum.$(SUFFIX) zasum.$(SUFFIX) : asum.c
	$(CC) $(CFLAGS) -c $(ASUM_VARIANT) -o $(@F) $^
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cgemm3m.$(SUFFIX) : gemm3m.c
|
cgemm3m.$(SUFFIX) : gemm3m.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
#undef ASUM
|
||||||
|
|
||||||
|
#ifdef COMPLEX
|
||||||
|
#ifdef DOUBLE
|
||||||
|
#define ASUM BLASFUNC(dzasum)
|
||||||
|
#else
|
||||||
|
#define ASUM BLASFUNC(scasum)
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef DOUBLE
|
||||||
|
#define ASUM BLASFUNC(dasum)
|
||||||
|
#else
|
||||||
|
#define ASUM BLASFUNC(sasum)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * gettimeofday() replacement used when building for Windows.
 *
 * tv  receives the current system time as seconds / microseconds relative to
 *     the Unix epoch; nothing is written when tv is NULL.
 * tz  ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* time zone information is not provided */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Assemble the two 32-bit halves into one 64-bit tick count. */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /* converting file time to unix epoch */
      tmpres /= 10;  /* convert into microseconds */
      tmpres -= DELTA_EPOCH_IN_MICROSECS;

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||||
|
|
||||||
|
/* NOTE: the trailing "&& 0" keeps this whole section compiled out. */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes from a SHM_HUGETLB System V shared-memory segment.
 *
 * The requested size is increased by HUGE_PAGESIZE and masked with
 * ~(HUGE_PAGESIZE - 1) before being passed to shmget() (a multiple of
 * HUGE_PAGESIZE, presumably a power of two -- confirm against its definition).
 * On shmget()/shmat() failure a message is printed to stdout and the process
 * exits with status 1.  The segment is marked for removal (IPC_RMID) right
 * after it has been attached.
 *
 * Returns the attached address.
 */
static void *huge_malloc(BLASLONG size){

  void *mapped;
  int segment = shmget(IPC_PRIVATE,
                       (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                       SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

/* From here on, malloc() calls in this file go through huge_malloc(). */
#define malloc huge_malloc

#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
|
FLOAT *x;
|
||||||
|
FLOAT result;
|
||||||
|
blasint m, i;
|
||||||
|
blasint inc_x=1;
|
||||||
|
int loops = 1;
|
||||||
|
int l;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timeval start, stop;
|
||||||
|
double time1,timeg;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(m = from; m <= to; m += step)
|
||||||
|
{
|
||||||
|
|
||||||
|
timeg=0;
|
||||||
|
|
||||||
|
fprintf(stderr, " %6d : ", (int)m);
|
||||||
|
|
||||||
|
|
||||||
|
for (l=0; l<loops; l++)
|
||||||
|
{
|
||||||
|
|
||||||
|
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||||
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
result = ASUM (&m, x, &inc_x);
|
||||||
|
|
||||||
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
||||||
|
timeg += time1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
timeg /= loops;
|
||||||
|
|
||||||
|
#ifdef COMPLEX
|
||||||
|
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
|
||||||
|
#else
|
||||||
|
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -0,0 +1,202 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
#undef SCAL
|
||||||
|
|
||||||
|
#ifdef COMPLEX
|
||||||
|
#ifdef DOUBLE
|
||||||
|
#define SCAL BLASFUNC(zscal)
|
||||||
|
#else
|
||||||
|
#define SCAL BLASFUNC(cscal)
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef DOUBLE
|
||||||
|
#define SCAL BLASFUNC(dscal)
|
||||||
|
#else
|
||||||
|
#define SCAL BLASFUNC(sscal)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Stand-in for gettimeofday() on Windows builds.
 *
 * When tv is non-NULL it is filled with the current system time, expressed
 * as seconds and microseconds relative to the Unix epoch.  The tz argument
 * is not used.  The function always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Combine high and low 32-bit words into a single 64-bit value. */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /* converting file time to unix epoch */
      tmpres /= 10;  /* convert into microseconds */
      tmpres -= DELTA_EPOCH_IN_MICROSECS;

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||||
|
|
||||||
|
/* NOTE: this section is disabled -- the condition ends in "&& 0". */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * malloc() substitute backed by a SHM_HUGETLB System V shared-memory segment.
 *
 * shmget() is asked for (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1) bytes.
 * If shmget() or shmat() fails, a message is printed to stdout and the
 * process exits with status 1.  Once attached, the segment is flagged for
 * removal with IPC_RMID.
 *
 * Returns the address at which the segment was attached.
 */
static void *huge_malloc(BLASLONG size){

  int id;
  void *addr;

  id = shmget(IPC_PRIVATE,
              (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
              SHM_HUGETLB | IPC_CREAT |0600);
  if (id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  addr = shmat(id, NULL, SHM_RND);
  if ((BLASLONG)addr == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(id, IPC_RMID, 0);

  return addr;
}

/* Redirect later malloc() calls in this file to huge_malloc(). */
#define malloc huge_malloc

#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
|
FLOAT *x, *y;
|
||||||
|
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||||
|
blasint m, i;
|
||||||
|
blasint inc_x=1,inc_y=1;
|
||||||
|
int loops = 1;
|
||||||
|
int l;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timeval start, stop;
|
||||||
|
double time1,timeg;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(m = from; m <= to; m += step)
|
||||||
|
{
|
||||||
|
|
||||||
|
timeg=0;
|
||||||
|
|
||||||
|
fprintf(stderr, " %6d : ", (int)m);
|
||||||
|
|
||||||
|
|
||||||
|
for (l=0; l<loops; l++)
|
||||||
|
{
|
||||||
|
|
||||||
|
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||||
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||||
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
SCAL (&m, alpha, x, &inc_x);
|
||||||
|
|
||||||
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
||||||
|
timeg += time1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
timeg /= loops;
|
||||||
|
|
||||||
|
#ifdef COMPLEX
|
||||||
|
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
|
||||||
|
#else
|
||||||
|
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -71,8 +71,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifndef COMMON_ARM
|
#ifndef COMMON_ARM
|
||||||
#define COMMON_ARM
|
#define COMMON_ARM
|
||||||
|
|
||||||
#define MB
|
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
|
||||||
#define WMB
|
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
|
||||||
|
|
||||||
#define INLINE inline
|
#define INLINE inline
|
||||||
|
|
||||||
|
@ -88,9 +88,12 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
while (*address) {YIELDING;};
|
while (*address) {YIELDING;};
|
||||||
|
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
|
"1: \n\t"
|
||||||
"ldrex r2, [%1] \n\t"
|
"ldrex r2, [%1] \n\t"
|
||||||
"mov r2, #0 \n\t"
|
"mov r2, #0 \n\t"
|
||||||
"strex r3, r2, [%1] \n\t"
|
"strex r3, r2, [%1] \n\t"
|
||||||
|
"cmp r3, #0 \n\t"
|
||||||
|
"bne 1b \n\t"
|
||||||
"mov %0 , r3 \n\t"
|
"mov %0 , r3 \n\t"
|
||||||
: "=r"(ret), "=r"(address)
|
: "=r"(ret), "=r"(address)
|
||||||
: "1"(address)
|
: "1"(address)
|
||||||
|
|
|
@ -10,6 +10,9 @@ ZGEMVTKERNEL = zgemv_t_4.c
|
||||||
CGEMVNKERNEL = cgemv_n_4.c
|
CGEMVNKERNEL = cgemv_n_4.c
|
||||||
CGEMVTKERNEL = cgemv_t_4.c
|
CGEMVTKERNEL = cgemv_t_4.c
|
||||||
|
|
||||||
|
DSYMV_L_KERNEL = dsymv_L.c
|
||||||
|
DSYMV_U_KERNEL = dsymv_U.c
|
||||||
|
|
||||||
SDOTKERNEL = sdot.c
|
SDOTKERNEL = sdot.c
|
||||||
DDOTKERNEL = ddot.c
|
DDOTKERNEL = ddot.c
|
||||||
CDOTKERNEL = cdot.c
|
CDOTKERNEL = cdot.c
|
||||||
|
|
|
@ -8,6 +8,9 @@ DDOTKERNEL = ddot.c
|
||||||
CDOTKERNEL = cdot.c
|
CDOTKERNEL = cdot.c
|
||||||
ZDOTKERNEL = zdot.c
|
ZDOTKERNEL = zdot.c
|
||||||
|
|
||||||
|
DSYMV_L_KERNEL = dsymv_L.c
|
||||||
|
DSYMV_U_KERNEL = dsymv_U.c
|
||||||
|
|
||||||
|
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
|
|
|
@ -30,6 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "dsymv_L_microk_bulldozer-2.c"
|
#include "dsymv_L_microk_bulldozer-2.c"
|
||||||
|
#elif defined(HASWELL)
|
||||||
|
#include "dsymv_L_microk_haswell-2.c"
|
||||||
|
#elif defined(SANDYBRIDGE)
|
||||||
|
#include "dsymv_L_microk_sandy-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "dsymv_L_microk_nehalem-2.c"
|
#include "dsymv_L_microk_nehalem-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0]
|
||||||
|
"vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1]
|
||||||
|
"vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2]
|
||||||
|
"vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3]
|
||||||
|
"vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0]
|
||||||
|
"vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a
|
||||||
|
"vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a
|
||||||
|
"vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a
|
||||||
|
"vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%ymm9 , -32(%3,%0,8) \n\t"
|
||||||
|
|
||||||
|
"cmpq %0 , %1 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vmovsd (%9), %%xmm4 \n\t"
|
||||||
|
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||||
|
"vmovsd 16(%9), %%xmm6 \n\t"
|
||||||
|
"vmovsd 24(%9), %%xmm7 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (from), // 0
|
||||||
|
"r" (to), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (a[0]), // 4
|
||||||
|
"r" (a[1]), // 5
|
||||||
|
"r" (a[2]), // 6
|
||||||
|
"r" (a[3]), // 8
|
||||||
|
"r" (temp1), // 8
|
||||||
|
"r" (temp2) // 9
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,138 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0]
|
||||||
|
"vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1]
|
||||||
|
"vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2]
|
||||||
|
"vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3]
|
||||||
|
"vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0]
|
||||||
|
"vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a
|
||||||
|
"vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a
|
||||||
|
"vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a
|
||||||
|
"vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a
|
||||||
|
|
||||||
|
"vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm13, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t"
|
||||||
|
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%ymm9 , -32(%3,%0,8) \n\t"
|
||||||
|
|
||||||
|
"cmpq %0 , %1 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vmovsd (%9), %%xmm4 \n\t"
|
||||||
|
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||||
|
"vmovsd 16(%9), %%xmm6 \n\t"
|
||||||
|
"vmovsd 24(%9), %%xmm7 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (from), // 0
|
||||||
|
"r" (to), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (a[0]), // 4
|
||||||
|
"r" (a[1]), // 5
|
||||||
|
"r" (a[2]), // 6
|
||||||
|
"r" (a[3]), // 8
|
||||||
|
"r" (temp1), // 8
|
||||||
|
"r" (temp2) // 9
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "dsymv_U_microk_bulldozer-2.c"
|
#include "dsymv_U_microk_bulldozer-2.c"
|
||||||
|
#elif defined(HASWELL)
|
||||||
|
#include "dsymv_U_microk_haswell-2.c"
|
||||||
|
#elif defined(SANDYBRIDGE)
|
||||||
|
#include "dsymv_U_microk_sandy-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "dsymv_U_microk_nehalem-2.c"
|
#include "dsymv_U_microk_nehalem-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,131 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||||
|
{
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0]
|
||||||
|
"vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1]
|
||||||
|
"vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2]
|
||||||
|
"vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3]
|
||||||
|
"vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0]
|
||||||
|
"vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
|
||||||
|
"xorq %0,%0 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a
|
||||||
|
"vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a
|
||||||
|
"vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a
|
||||||
|
"vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a
|
||||||
|
"vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%ymm9 , -32(%3,%0,8) \n\t"
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vmovsd (%9), %%xmm4 \n\t"
|
||||||
|
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||||
|
"vmovsd 16(%9), %%xmm6 \n\t"
|
||||||
|
"vmovsd 24(%9), %%xmm7 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (a0), // 4
|
||||||
|
"r" (a1), // 5
|
||||||
|
"r" (a2), // 6
|
||||||
|
"r" (a3), // 8
|
||||||
|
"r" (temp1), // 8
|
||||||
|
"r" (temp2) // 9
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,140 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||||
|
{
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0]
|
||||||
|
"vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1]
|
||||||
|
"vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2]
|
||||||
|
"vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3]
|
||||||
|
"vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0]
|
||||||
|
"vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1]
|
||||||
|
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
|
||||||
|
"xorq %0,%0 \n\t"
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
|
||||||
|
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
|
||||||
|
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x
|
||||||
|
|
||||||
|
"vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a
|
||||||
|
"vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a
|
||||||
|
"vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a
|
||||||
|
"vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a
|
||||||
|
|
||||||
|
"vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm13, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t"
|
||||||
|
|
||||||
|
"vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t"
|
||||||
|
"vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t"
|
||||||
|
"vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t"
|
||||||
|
"vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t"
|
||||||
|
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
|
"vmovups %%ymm9 , -32(%3,%0,8) \n\t"
|
||||||
|
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"vmovsd (%9), %%xmm4 \n\t"
|
||||||
|
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||||
|
"vmovsd 16(%9), %%xmm6 \n\t"
|
||||||
|
"vmovsd 24(%9), %%xmm7 \n\t"
|
||||||
|
|
||||||
|
"vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t"
|
||||||
|
"vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t"
|
||||||
|
"vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t"
|
||||||
|
"vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t"
|
||||||
|
"vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
|
||||||
|
"vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
|
||||||
|
"vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
|
||||||
|
"vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
|
||||||
|
|
||||||
|
"vmovsd %%xmm0 , (%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
|
||||||
|
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (a0), // 4
|
||||||
|
"r" (a1), // 5
|
||||||
|
"r" (a2), // 6
|
||||||
|
"r" (a3), // 8
|
||||||
|
"r" (temp1), // 8
|
||||||
|
"r" (temp2) // 9
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -169,4 +169,4 @@ cchkee.o: cchkee.f
|
||||||
zchkee.o: zchkee.f
|
zchkee.o: zchkee.f
|
||||||
$(FORTRAN) $(DRVOPTS) -c $< -o $@
|
$(FORTRAN) $(DRVOPTS) -c $< -o $@
|
||||||
|
|
||||||
.f.o : ; $(FORTRAN) $(OPTS) -c $< -o $@
|
.f.o : ; $(FORTRAN) $(DRVOPTS) -c $< -o $@
|
||||||
|
|
|
@ -338,4 +338,4 @@ zchkaa.o: zchkaa.f
|
||||||
$(FORTRAN) $(DRVOPTS) -c $< -o $@
|
$(FORTRAN) $(DRVOPTS) -c $< -o $@
|
||||||
|
|
||||||
.f.o:
|
.f.o:
|
||||||
$(FORTRAN) $(OPTS) -c $< -o $@
|
$(FORTRAN) $(DRVOPTS) -c $< -o $@
|
||||||
|
|
Loading…
Reference in New Issue