Merge pull request #534 from wernsaar/develop

Refs #533. added optimized saxpy- and daxpy-kernel for haswell and sandybridge
This commit is contained in:
Zhang Xianyi 2015-04-07 12:48:11 -05:00
commit 4f680a7d61
12 changed files with 819 additions and 6 deletions

View File

@ -10,7 +10,7 @@ include $(TOPDIR)/Makefile.system
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
# ACML 6.1 custom # ACML 6.1 custom
ACML=/home/werner/project/acml6.1/gfortran64_mp/lib ACML=/home/saar/acml6.1/gfortran64_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
@ -40,7 +40,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto \ sger.goto dger.goto \
sdot.goto ddot.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
ssymv.goto dsymv.goto csymv.goto zsymv.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \
chemv.goto zhemv.goto \ chemv.goto zhemv.goto \
@ -61,7 +61,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml \ sger.acml dger.acml \
sdot.acml ddot.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
ssymv.acml dsymv.acml csymv.acml zsymv.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \
chemv.acml zhemv.acml \ chemv.acml zhemv.acml \
@ -104,7 +104,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl \ sger.mkl dger.mkl \
sdot.mkl ddot.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \
saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
chemv.mkl zhemv.mkl \ chemv.mkl zhemv.mkl \
@ -998,6 +998,32 @@ ddot.atlas : ddot.$(SUFFIX)
ddot.mkl : ddot.$(SUFFIX) ddot.mkl : ddot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Cdot ####################################################
cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
cdot.acml : cdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cdot.atlas : cdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cdot.mkl : cdot-intel.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zdot ####################################################
zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
zdot.acml : zdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zdot.atlas : zdot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zdot.mkl : zdot-intel.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Saxpy #################################################### ##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
@ -1264,6 +1290,20 @@ sdot.$(SUFFIX) : dot.c
ddot.$(SUFFIX) : dot.c ddot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cdot.$(SUFFIX) : zdot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zdot.$(SUFFIX) : zdot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
cdot-intel.$(SUFFIX) : zdot-intel.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zdot-intel.$(SUFFIX) : zdot-intel.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
saxpy.$(SUFFIX) : axpy.c saxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

196
benchmark/zdot-intel.c Normal file
View File

@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#define RETURN_BY_STACK 1
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
DOT (&result, &m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/zdot.c Normal file
View File

@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -13,6 +13,8 @@ CGEMVTKERNEL = cgemv_t_4.c
SDOTKERNEL = sdot.c SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c DDOTKERNEL = ddot.c
DAXPYKERNEL = daxpy.c
SAXPYKERNEL = saxpy.c
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c

View File

@ -1,3 +1,7 @@
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SGEMVNKERNEL = sgemv_n_4.c SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c SGEMVTKERNEL = sgemv_t_4.c
@ -7,6 +11,7 @@ ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot_bulldozer.S DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S

View File

@ -6,6 +6,9 @@ ZGEMVNKERNEL = zgemv_n_4.c
SDOTKERNEL = sdot.c SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c DDOTKERNEL = ddot.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c

View File

@ -33,6 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_nehalem-2.c" #include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "daxpy_microk_bulldozer-2.c" #include "daxpy_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "daxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "daxpy_microk_sandy-2.c"
#endif #endif
@ -71,7 +75,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
int n1 = n & -8; #if defined(SANDYBRIDGE)
int n1 = n & -32;
#else
int n1 = n & -16;
#endif
if ( n1 ) if ( n1 )
daxpy_kernel_8(n1, x, y , &da ); daxpy_kernel_8(n1, x, y , &da );

View File

@ -0,0 +1,78 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha
".align 16 \n\t"
"1: \n\t"
"vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y
"vmovups 32(%3,%0,8), %%ymm13 \n\t" // 4 * y
"vmovups 64(%3,%0,8), %%ymm14 \n\t" // 4 * y
"vmovups 96(%3,%0,8), %%ymm15 \n\t" // 4 * y
"vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t" // y += alpha * x
"vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t" // y += alpha * x
"vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t" // y += alpha * x
"vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t" // y += alpha * x
"vmovups %%ymm12, (%3,%0,8) \n\t"
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
"vmovups %%ymm14, 64(%3,%0,8) \n\t"
"vmovups %%ymm15, 96(%3,%0,8) \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,100 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha
".align 16 \n\t"
"1: \n\t"
"vmovups (%3,%0,8), %%ymm8 \n\t"
"vmovups 32(%3,%0,8), %%ymm9 \n\t"
"vmovups 64(%3,%0,8), %%ymm10 \n\t"
"vmovups 96(%3,%0,8), %%ymm11 \n\t"
"vmovups 128(%3,%0,8), %%ymm12 \n\t"
"vmovups 160(%3,%0,8), %%ymm13 \n\t"
"vmovups 192(%3,%0,8), %%ymm14 \n\t"
"vmovups 224(%3,%0,8), %%ymm15 \n\t"
"vmulpd (%2,%0,8), %%ymm0, %%ymm1 \n\t"
"vmulpd 32(%2,%0,8), %%ymm0, %%ymm2 \n\t"
"vaddpd %%ymm8 , %%ymm1, %%ymm8 \n\t"
"vmulpd 64(%2,%0,8), %%ymm0, %%ymm3 \n\t"
"vaddpd %%ymm9 , %%ymm2, %%ymm9 \n\t"
"vmulpd 96(%2,%0,8), %%ymm0, %%ymm4 \n\t"
"vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t"
"vmulpd 128(%2,%0,8), %%ymm0, %%ymm5 \n\t"
"vaddpd %%ymm11, %%ymm4, %%ymm11 \n\t"
"vmulpd 160(%2,%0,8), %%ymm0, %%ymm6 \n\t"
"vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t"
"vmulpd 192(%2,%0,8), %%ymm0, %%ymm7 \n\t"
"vmulpd 224(%2,%0,8), %%ymm0, %%ymm1 \n\t"
"vaddpd %%ymm13, %%ymm6, %%ymm13 \n\t"
"vmovups %%ymm8 , (%3,%0,8) \n\t"
"vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t"
"vmovups %%ymm9 , 32(%3,%0,8) \n\t"
"vaddpd %%ymm15, %%ymm1, %%ymm15 \n\t"
"vmovups %%ymm10, 64(%3,%0,8) \n\t"
"vmovups %%ymm11, 96(%3,%0,8) \n\t"
"vmovups %%ymm12,128(%3,%0,8) \n\t"
"vmovups %%ymm13,160(%3,%0,8) \n\t"
"vmovups %%ymm14,192(%3,%0,8) \n\t"
"vmovups %%ymm15,224(%3,%0,8) \n\t"
"addq $32, %0 \n\t"
"subq $32, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -31,6 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c" #include "saxpy_microk_nehalem-2.c"
#elif defined(HASWELL)
#include "saxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c"
#endif #endif
@ -69,7 +73,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
int n1 = n & -16; #if defined(SANDYBRIDGE)
int n1 = n & -64;
#else
int n1 = n & -32;
#endif
if ( n1 ) if ( n1 )
saxpy_kernel_16(n1, x, y , &da ); saxpy_kernel_16(n1, x, y , &da );

View File

@ -0,0 +1,78 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%4), %%ymm0 \n\t" // alpha
".align 16 \n\t"
"1: \n\t"
"vmovups (%3,%0,4), %%ymm12 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm13 \n\t" // 8 * y
"vmovups 64(%3,%0,4), %%ymm14 \n\t" // 8 * y
"vmovups 96(%3,%0,4), %%ymm15 \n\t" // 8 * y
"vfmadd231ps (%2,%0,4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x
"vfmadd231ps 32(%2,%0,4), %%ymm0 , %%ymm13 \n\t" // y += alpha * x
"vfmadd231ps 64(%2,%0,4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x
"vfmadd231ps 96(%2,%0,4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x
"vmovups %%ymm12, (%3,%0,4) \n\t"
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
"vmovups %%ymm14, 64(%3,%0,4) \n\t"
"vmovups %%ymm15, 96(%3,%0,4) \n\t"
"addq $32, %0 \n\t"
"subq $32, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,100 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%4), %%ymm0 \n\t" // alpha
".align 16 \n\t"
"1: \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t"
"vmovups 32(%3,%0,4), %%ymm9 \n\t"
"vmovups 64(%3,%0,4), %%ymm10 \n\t"
"vmovups 96(%3,%0,4), %%ymm11 \n\t"
"vmovups 128(%3,%0,4), %%ymm12 \n\t"
"vmovups 160(%3,%0,4), %%ymm13 \n\t"
"vmovups 192(%3,%0,4), %%ymm14 \n\t"
"vmovups 224(%3,%0,4), %%ymm15 \n\t"
"vmulps (%2,%0,4), %%ymm0, %%ymm1 \n\t"
"vmulps 32(%2,%0,4), %%ymm0, %%ymm2 \n\t"
"vaddps %%ymm8 , %%ymm1, %%ymm8 \n\t"
"vmulps 64(%2,%0,4), %%ymm0, %%ymm3 \n\t"
"vaddps %%ymm9 , %%ymm2, %%ymm9 \n\t"
"vmulps 96(%2,%0,4), %%ymm0, %%ymm4 \n\t"
"vaddps %%ymm10, %%ymm3, %%ymm10 \n\t"
"vmulps 128(%2,%0,4), %%ymm0, %%ymm5 \n\t"
"vaddps %%ymm11, %%ymm4, %%ymm11 \n\t"
"vmulps 160(%2,%0,4), %%ymm0, %%ymm6 \n\t"
"vaddps %%ymm12, %%ymm5, %%ymm12 \n\t"
"vmulps 192(%2,%0,4), %%ymm0, %%ymm7 \n\t"
"vmulps 224(%2,%0,4), %%ymm0, %%ymm1 \n\t"
"vaddps %%ymm13, %%ymm6, %%ymm13 \n\t"
"vmovups %%ymm8 , (%3,%0,4) \n\t"
"vaddps %%ymm14, %%ymm7, %%ymm14 \n\t"
"vmovups %%ymm9 , 32(%3,%0,4) \n\t"
"vaddps %%ymm15, %%ymm1, %%ymm15 \n\t"
"vmovups %%ymm10, 64(%3,%0,4) \n\t"
"vmovups %%ymm11, 96(%3,%0,4) \n\t"
"vmovups %%ymm12,128(%3,%0,4) \n\t"
"vmovups %%ymm13,160(%3,%0,4) \n\t"
"vmovups %%ymm14,192(%3,%0,4) \n\t"
"vmovups %%ymm15,224(%3,%0,4) \n\t"
"addq $64, %0 \n\t"
"subq $64, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}