Merge pull request #445 from wernsaar/develop

A lot of optimizations for gemv kernels
Zhang Xianyi 2014-09-10 16:28:14 +08:00
commit d13e92f07e
34 changed files with 4871 additions and 2190 deletions

benchmark/gemv.c (modified)

@@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){
 	blasint inc_x=1,inc_y=1;
 	blasint n=0;
 	int has_param_n = 0;
+	int has_param_m = 0;
 	int loops = 1;
 	int l;
 	char *p;
@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;}
int tomax = to;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_PARAM_N"))) { if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p); n = atoi(p);
if ((n>0) && (n<=to)) has_param_n = 1; if ((n>0)) has_param_n = 1;
if ( n > tomax ) tomax = n;
} }
if ( has_param_n == 0 )
if ((p = getenv("OPENBLAS_PARAM_M"))) {
m = atoi(p);
if ((m>0)) has_param_m = 1;
if ( m > tomax ) tomax = m;
}
if ( has_param_n == 1 )
fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops);
else
fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1); fprintf(stderr,"Out of Memory!!\n");exit(1);
} }
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1); fprintf(stderr,"Out of Memory!!\n");exit(1);
} }
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1); fprintf(stderr,"Out of Memory!!\n");exit(1);
} }
@@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){

 	fprintf(stderr, "   SIZE       Flops\n");

+	if (has_param_m == 0)
+	{
 	for(m = from; m <= to; m += step)
 	{
 		timeg=0;
 		if ( has_param_n == 0 ) n = m;
 		fprintf(stderr, " %6dx%d : ", (int)m,(int)n);

 		for(j = 0; j < m; j++){
 			for(i = 0; i < n * COMPSIZE; i++){
 				a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 			}
 		}

 		for (l=0; l<loops; l++)
 		{
 			for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
 				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 			}
 			for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
 				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 			}

 			gettimeofday( &start, (struct timezone *)0);
 			GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
 			gettimeofday( &stop, (struct timezone *)0);
 			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
 			timeg += time1;
 		}

 		timeg /= loops;
-		fprintf(stderr,
-			" %10.2f MFlops\n",
-			COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+		fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
 	}
+	}
+	else
+	{
+		for(n = from; n <= to; n += step)
+		{
+			timeg=0;
+			fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+
+			for(j = 0; j < m; j++){
+				for(i = 0; i < n * COMPSIZE; i++){
+					a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+				}
+			}
+
+			for (l=0; l<loops; l++)
+			{
+				for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
+					x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+				}
+				for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+					y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+				}
+
+				gettimeofday( &start, (struct timezone *)0);
+				GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+				gettimeofday( &stop, (struct timezone *)0);
+				time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+				timeg += time1;
+			}
+
+			timeg /= loops;
+			fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+		}
+	}

 	return 0;
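Note on the change above: OPENBLAS_PARAM_N pins the column count while m sweeps from..to, and the new OPENBLAS_PARAM_M pins the row count and sweeps n instead. The buffers are now sized by tomax, the larger of the sweep limit and any pinned dimension, so a pinned dimension greater than "to" no longer overruns the allocations (the old n<=to restriction is dropped accordingly). For reference, the MFlops figure follows from the GEMV operation count; a minimal sketch of the printed formula (gemv_mflops is a hypothetical helper, not part of the patch):

	/* One GEMV performs about 2*m*n multiply-adds per real component;
	   compsize is 1 for real and 2 for complex types, and the squared
	   factor accounts for the 4x cost of complex multiply-adds. */
	static double gemv_mflops(int m, int n, int compsize, double seconds)
	{
		return compsize * compsize * 2.0 * (double)m * (double)n / seconds * 1.e-6;
	}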

benchmark/tplot-header (new file, 42 lines)

@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Sgemv\nTRANS=T\nBulldozer"
plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
set output "print.png";
show title;
show plot;
show output;

common_x86_64.h (modified)

@@ -46,6 +46,7 @@
 #define __volatile__
 #endif

+/*
 #ifdef HAVE_SSE2
 #define MB   __asm__ __volatile__ ("mfence");
 #define WMB  __asm__ __volatile__ ("sfence");
@@ -53,6 +54,10 @@
 #define MB
 #define WMB
 #endif
+*/
+
+#define MB
+#define WMB

 static void __inline blas_lock(volatile BLASULONG *address){
@@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
         : "0" (op));
 }

+/*
 #define WHEREAMI
+*/

 static inline int WhereAmI(void){
 	int eax, ebx, ecx, edx;
@@ -111,6 +118,7 @@ static inline int WhereAmI(void){
 	return apicid;
 }

+
 #ifdef CORE_BARCELONA
 #define IFLUSH   gotoblas_iflush()
 #define IFLUSH_HALF gotoblas_iflush_half()

driver/others/parameter.c (modified)

@@ -251,7 +251,11 @@ void blas_set_parameter(void){
 	env_var_t p;
 	int factor;
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
+	int size = 16;
+#else
 	int size = get_L2_size();
+#endif

 #if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
 	size >>= 7;

interface/gemv.c (modified)

@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order,
 	int nthreads_avail = nthreads_max;

 	double MNK = (double) m * (double) n;
-	if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+	if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
 		nthreads_max = 1;

 	if ( nthreads_max > nthreads_avail )
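Problems at or below the cutoff stay single-threaded, and the new expression lowers that cutoff considerably. A worked example, assuming the default GEMM_MULTITHREAD_THRESHOLD of 4 (build-configurable, so this value is an assumption):

	/* old cutoff: 500.0 * 100.0 * 4     = 200000 elements (~447x447)      */
	/* new cutoff: 24.0 * 24.0 * (4 * 4) =   9216 elements (~ 96x96)       */
	/* so GEMV problems above roughly 96x96 become eligible for threading. */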

kernel/x86_64/KERNEL.BULLDOZER (modified)

@@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c
 SSYMV_U_KERNEL =  ssymv_U.c
 SSYMV_L_KERNEL =  ssymv_L.c

-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c

 ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t.c

kernel/x86_64/KERNEL.HASWELL (modified)

@@ -1,8 +1,8 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c

-DGEMVNKERNEL = dgemv_n.c
-DGEMVTKERNEL = dgemv_t.c
+DGEMVNKERNEL = dgemv_n_4.c
+DGEMVTKERNEL = dgemv_t_4.c

 ZGEMVNKERNEL = zgemv_n.c
 ZGEMVTKERNEL = zgemv_t.c

kernel/x86_64/KERNEL.NEHALEM (modified)

@@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c
 SSYMV_U_KERNEL =  ssymv_U.c
 SSYMV_L_KERNEL =  ssymv_L.c

-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c

-DGEMVNKERNEL = dgemv_n.c
+DGEMVNKERNEL = dgemv_n_4.c

 SGEMMKERNEL    =  gemm_kernel_4x8_nehalem.S
 SGEMMINCOPY    =  gemm_ncopy_4.S

kernel/x86_64/KERNEL.PILEDRIVER (modified)

@@ -1,5 +1,5 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c

 ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t.S

kernel/x86_64/KERNEL.SANDYBRIDGE (modified)

@@ -1,5 +1,5 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c

 ZGEMVNKERNEL = zgemv_n.c

kernel/x86_64/dgemv_n_4.c (new file, 548 lines)

@@ -0,0 +1,548 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL)
#include "dgemv_n_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x8
static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
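	/* Generic C fallback, used only when no arch-specific microkernel
	   defines HAVE_KERNEL_4x8: an eight-column panel update,
	   y[i] += alpha * sum_{j=0..7} col_j[i] * xo[j], where ap[0..3]
	   point to the first four columns and the lda4 offset reaches
	   columns 4..7 (n is assumed to be a positive multiple of 4). */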
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT *b0,*b1,*b2,*b3;
FLOAT *x4;
FLOAT x[8];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x4 = x + 4;
for ( i=0; i<8; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3];
y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x4
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x2
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
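	/* SSE2 two-column update: y[0..n) += alpha*x[0]*ap[0][0..n)
	   + alpha*x[1]*ap[1][0..n), four rows per iteration
	   (n is assumed to be a positive multiple of 4). */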
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2) , %%xmm12 \n\t" // x0
"movsd (%6) , %%xmm4 \n\t" // alpha
"movsd 8(%2) , %%xmm13 \n\t" // x1
"mulsd %%xmm4 , %%xmm12 \n\t" // alpha
"mulsd %%xmm4 , %%xmm13 \n\t" // alpha
"shufpd $0, %%xmm12, %%xmm12 \n\t"
"shufpd $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm5 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
"movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
#ifndef HAVE_KERNEL_4x1
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
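	/* Single-column tail kernel: y[0..n) += alpha*x[0]*ap[0..n),
	   four rows per iteration. */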
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"mulsd (%5), %%xmm12 \n\t" // alpha
"shufpd $0, %%xmm12, %%xmm12 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm12, %%xmm9 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
"movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (alpha) // 5
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*8);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
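For reference, CNAME above is a blocked and vectorized form of the plain column sweep below (a naive sketch of the same semantics under this file's conventions, not part of the patch; A is column-major with leading dimension lda):

	static void dgemv_n_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
	                        FLOAT *a, BLASLONG lda,
	                        FLOAT *x, BLASLONG inc_x,
	                        FLOAT *y, BLASLONG inc_y)
	{
		BLASLONG i, j;
		for ( j = 0; j < n; j++ )
		{
			FLOAT xj = alpha * x[j * inc_x];         /* scale x once per column   */
			for ( i = 0; i < m; i++ )
				y[i * inc_y] += a[i + j * lda] * xj; /* y += alpha * A(:,j) * x[j] */
		}
	}

The driver blocks m into panels of at most NBMAX rows so the active slice of y stays cache-resident, processes columns in groups of 8/4/2/1 through the kernels above, and handles the m3 = m & 3 leftover rows with the scalar paths.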

kernel/x86_64/dgemv_n_microk_haswell-4.c (new file, 247 lines)

@@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
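	/* AVX2/FMA3 path: broadcasts x[0..7] and alpha, accumulates the eight
	   column contributions with vfmadd231pd and applies alpha once per
	   store; eight rows per main-loop iteration, with a four-row prologue
	   when n % 8 == 4. */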
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $8 , %8 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L8END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L8END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/dgemv_n_microk_nehalem-4.c (new file, 265 lines)

@@ -0,0 +1,265 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
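	/* SSE2 path: splats x[0..7] and alpha with shufpd, then updates four
	   rows (two xmm registers of two doubles each) per iteration across
	   the eight-column panel. */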
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"movsd 8(%2), %%xmm13 \n\t" // x1
"movsd 16(%2), %%xmm14 \n\t" // x2
"movsd 24(%2), %%xmm15 \n\t" // x3
"shufpd $0, %%xmm12, %%xmm12\n\t"
"shufpd $0, %%xmm13, %%xmm13\n\t"
"shufpd $0, %%xmm14, %%xmm14\n\t"
"shufpd $0, %%xmm15, %%xmm15\n\t"
"movsd 32(%2), %%xmm0 \n\t" // x4
"movsd 40(%2), %%xmm1 \n\t" // x5
"movsd 48(%2), %%xmm2 \n\t" // x6
"movsd 56(%2), %%xmm3 \n\t" // x7
"shufpd $0, %%xmm0 , %%xmm0 \n\t"
"shufpd $0, %%xmm1 , %%xmm1 \n\t"
"shufpd $0, %%xmm2 , %%xmm2 \n\t"
"shufpd $0, %%xmm3 , %%xmm3 \n\t"
"movsd (%9), %%xmm6 \n\t" // alpha
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
".align 2 \n\t"
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"movups (%6,%0,8), %%xmm10 \n\t"
"movups (%7,%0,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,8), %%xmm8 \n\t"
"movups (%5,%8,8), %%xmm9 \n\t"
"movups (%6,%8,8), %%xmm10 \n\t"
"movups (%7,%8,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm0 , %%xmm8 \n\t"
"mulpd %%xmm1 , %%xmm9 \n\t"
"mulpd %%xmm2 , %%xmm10 \n\t"
"mulpd %%xmm3 , %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"addpd %%xmm5 , %%xmm4 \n\t"
"mulpd %%xmm6 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
".align 2 \n\t"
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"movups 16(%6,%0,8), %%xmm10 \n\t"
"movups 16(%7,%0,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"movups 16(%4,%8,8), %%xmm8 \n\t"
"movups 16(%5,%8,8), %%xmm9 \n\t"
"movups 16(%6,%8,8), %%xmm10 \n\t"
"movups 16(%7,%8,8), %%xmm11 \n\t"
".align 2 \n\t"
"mulpd %%xmm0 , %%xmm8 \n\t"
"mulpd %%xmm1 , %%xmm9 \n\t"
"mulpd %%xmm2 , %%xmm10 \n\t"
"mulpd %%xmm3 , %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm5 \n\t"
"addpd %%xmm10, %%xmm4 \n\t"
"addpd %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addpd %%xmm5 , %%xmm4 \n\t"
"mulpd %%xmm6 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movsd (%2), %%xmm12 \n\t" // x0
"movsd 8(%2), %%xmm13 \n\t" // x1
"movsd 16(%2), %%xmm14 \n\t" // x2
"movsd 24(%2), %%xmm15 \n\t" // x3
"shufpd $0, %%xmm12, %%xmm12\n\t"
"shufpd $0, %%xmm13, %%xmm13\n\t"
"shufpd $0, %%xmm14, %%xmm14\n\t"
"shufpd $0, %%xmm15, %%xmm15\n\t"
"movsd (%8), %%xmm6 \n\t" // alpha
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
"movups (%4,%0,8), %%xmm8 \n\t"
"movups (%5,%0,8), %%xmm9 \n\t"
"movups (%6,%0,8), %%xmm10 \n\t"
"movups (%7,%0,8), %%xmm11 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"addpd %%xmm10 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t"
"addpd %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, (%3,%0,8) \n\t" // 2 * y
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
"movups 16(%4,%0,8), %%xmm8 \n\t"
"movups 16(%5,%0,8), %%xmm9 \n\t"
"movups 16(%6,%0,8), %%xmm10 \n\t"
"movups 16(%7,%0,8), %%xmm11 \n\t"
"mulpd %%xmm12, %%xmm8 \n\t"
"mulpd %%xmm13, %%xmm9 \n\t"
"mulpd %%xmm14, %%xmm10 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t"
"addpd %%xmm8 , %%xmm4 \n\t"
"addpd %%xmm9 , %%xmm4 \n\t"
"addpd %%xmm10 , %%xmm4 \n\t"
"addpd %%xmm4 , %%xmm11 \n\t"
"mulpd %%xmm6 , %%xmm11 \n\t"
"addpd %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/dgemv_t_4.c (new file, 615 lines)

@@ -0,0 +1,615 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(HASWELL)
#include "dgemv_t_microk_haswell-4.c"
#endif
#define NBMAX 2048
#ifndef HAVE_KERNEL_4x4
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
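	/* Generic C fallback for the transposed kernel: four simultaneous dot
	   products, temp_j = dot(ap[j], x) over n elements (n a multiple of 4),
	   written to y[0..3]; alpha is applied later, when add_y merges the
	   partial results into the real y. */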
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
}
#endif
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
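	/* Two dot products at once: y[0] = dot(ap0, x), y[1] = dot(ap1, x);
	   alpha is applied by the caller. */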
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorpd %%xmm10 , %%xmm10 \n\t"
"xorpd %%xmm11 , %%xmm11 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
"movups 16(%5,%0,8) , %%xmm14 \n\t" // x
"movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0
"movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1
"mulpd %%xmm14 , %%xmm12 \n\t"
"mulpd %%xmm14 , %%xmm13 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"haddpd %%xmm11, %%xmm11 \n\t"
"movsd %%xmm10, (%2) \n\t"
"movsd %%xmm11,8(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
"r" (x) // 5
: "cc",
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
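	/* Single dot product: y[0] = dot(ap, x); alpha is applied by the caller. */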
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorpd %%xmm9 , %%xmm9 \n\t"
"xorpd %%xmm10 , %%xmm10 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"mulpd %%xmm11 , %%xmm12 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups 16(%3,%0,8) , %%xmm14 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"movups 16(%4,%0,8) , %%xmm13 \n\t"
"mulpd %%xmm11 , %%xmm12 \n\t"
"mulpd %%xmm13 , %%xmm14 \n\t"
"addq $4 , %0 \n\t"
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
"addpd %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"addpd %%xmm9 , %%xmm10 \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"movsd %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
: "cc",
"%xmm9", "%xmm10" ,
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
"memory"
);
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
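	/* dest[k*inc_dest] += da * src[k]: scalar loop for strided dest, SSE2
	   loop for the inc_dest == 1 fast path (n assumed positive and even). */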
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
"movsd (%2) , %%xmm10 \n\t"
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
"mulpd %%xmm10 , %%xmm12 \n\t"
"addq $2 , %0 \n\t"
"addpd %%xmm12 , %%xmm11 \n\t"
"subq $2 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + NBMAX;
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
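As in dgemv_n_4.c, the driver above is a blocked form of the plain transposed sweep (a naive sketch of the same semantics, not part of the patch):

	static void dgemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
	                        FLOAT *a, BLASLONG lda,
	                        FLOAT *x, BLASLONG inc_x,
	                        FLOAT *y, BLASLONG inc_y)
	{
		BLASLONG i, j;
		for ( j = 0; j < n; j++ )
		{
			FLOAT temp = 0.0;
			for ( i = 0; i < m; i++ )
				temp += a[i + j * lda] * x[i * inc_x]; /* dot(A(:,j), x) */
			y[j * inc_y] += alpha * temp;
		}
	}

Here m is blocked into NBMAX-row panels, x is packed into a contiguous xbuffer when inc_x != 1, and add_y folds each panel's partial dot products into y, applying the alpha scaling at that point.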

kernel/x86_64/dgemv_t_microk_haswell-4.c (new file, 127 lines)

@@ -0,0 +1,127 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
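	/* AVX2/FMA3 transposed kernel: accumulates four column dot products in
	   ymm4..ymm7 with vfmadd231pd, then reduces each with vextractf128 +
	   vhaddpd before storing y[0..3]. */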
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 384(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
// "prefetcht0 384(%4,%0,8) \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
// "prefetcht0 384(%5,%0,8) \n\t"
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
// "prefetcht0 384(%6,%0,8) \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"addq $8 , %0 \n\t"
// "prefetcht0 384(%7,%0,8) \n\t"
"vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
"subq $8 , %1 \n\t"
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
"vextractf128 $1 , %%ymm7, %%xmm15 \n\t"
"vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t"
"vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t"
"vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t"
"vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t"
"vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddpd %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddpd %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddpd %%xmm7, %%xmm7, %%xmm7 \n\t"
"vmovsd %%xmm4, (%3) \n\t"
"vmovsd %%xmm5, 8(%3) \n\t"
"vmovsd %%xmm6, 16(%3) \n\t"
"vmovsd %%xmm7, 24(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/sgemv_n_4.c (new file, 591 lines)

@@ -0,0 +1,591 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_n_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell-4.c"
#endif
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x8
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT *b0,*b1,*b2,*b3;
FLOAT *x4;
FLOAT x[8];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x4 = x + 4;
for ( i=0; i<8; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3];
y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x4
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4];
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
#endif
#ifndef HAVE_KERNEL_4x2
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2) , %%xmm12 \n\t" // x0
"movss (%6) , %%xmm4 \n\t" // alpha
"movss 4(%2) , %%xmm13 \n\t" // x1
"mulss %%xmm4 , %%xmm12 \n\t" // alpha
"mulss %%xmm4 , %%xmm13 \n\t" // alpha
"shufps $0, %%xmm12, %%xmm12 \n\t"
"shufps $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm9 , %%xmm4 \n\t"
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
#ifndef HAVE_KERNEL_4x1
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
BLASLONG register n1 = n & -8 ;
BLASLONG register n2 = n & 4 ;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"mulss (%6), %%xmm12 \n\t" // alpha
"shufps $0, %%xmm12, %%xmm12 \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
"movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm12, %%xmm9 \n\t"
"addps %%xmm4 , %%xmm8 \n\t"
"addps %%xmm5 , %%xmm9 \n\t"
"addq $8 , %0 \n\t"
"movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y
"movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"testq $0x04, %5 \n\t"
"jz .L08LABEL%= \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
"mulps %%xmm12, %%xmm8 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
:
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
"r" (n2), // 5
"r" (alpha) // 6
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#endif
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%2,%0,4) , %%xmm12 \n\t"
"movups (%3,%0,4) , %%xmm11 \n\t"
"addps %%xmm12 , %%xmm11 \n\t"
"addq $4 , %0 \n\t"
"movups %%xmm11, -16(%3,%0,4) \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
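For reference, the driver above computes the BLAS-2 operation y := alpha*A*x + y for an m-by-n column-major A. A naive equivalent that the blocked code must match (a sketch, not part of the patch; the name is illustrative, and negative increments, which BLAS handles by offsetting the start pointer, are ignored):
/* Reference semantics of CNAME above (sketch only) */
static void sgemv_n_ref(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda,
                        FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i,j;
	for ( j = 0; j < n; j++ )
	{
		FLOAT temp = alpha * x[j * inc_x];	/* scale once per column */
		for ( i = 0; i < m; i++ )
			y[i * inc_y] += temp * a[i + j * lda];
	}
}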


@@ -1,218 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell.c"
#else
#include "sgemv_n_microk_sandy.c"
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest += *src;
src++;
dest += inc_dest;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG register m2;
BLASLONG register n2;
FLOAT *xbuffer,*ybuffer;
xbuffer = buffer;
ybuffer = xbuffer + 2048 + 256;
n1 = n / 512 ;
n2 = n % 512 ;
m1 = m / 64;
m2 = m % 64;
y_ptr = y;
x_ptr = x;
for (j=0; j<n1; j++)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(512,x_ptr,xbuffer,inc_x);
a_ptr = a + j * 512 * lda;
y_ptr = y;
for(i = 0; i<m1; i++ )
{
sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(64,ybuffer,y_ptr,inc_y);
y_ptr += 64 * inc_y;
a_ptr += 64;
}
if ( m2 & 32 )
{
sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(32,ybuffer,y_ptr,inc_y);
y_ptr += 32 * inc_y;
a_ptr += 32;
}
if ( m2 & 16 )
{
sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(16,ybuffer,y_ptr,inc_y);
y_ptr += 16 * inc_y;
a_ptr += 16;
}
if ( m2 & 8 )
{
sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(8,ybuffer,y_ptr,inc_y);
y_ptr += 8 * inc_y;
a_ptr += 8;
}
if ( m2 & 4 )
{
sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(4,ybuffer,y_ptr,inc_y);
y_ptr += 4 * inc_y;
a_ptr += 4;
}
if ( m2 & 2 )
{
sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(2,ybuffer,y_ptr,inc_y);
y_ptr += 2 * inc_y;
a_ptr += 2;
}
if ( m2 & 1 )
{
sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(1,ybuffer,y_ptr,inc_y);
}
x_ptr += 512 * inc_x;
}
if ( n2 > 0 )
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(n2,x_ptr,xbuffer,inc_x);
a_ptr = a + n1 * 512 * lda;
y_ptr = y;
for(i = 0; i<m1; i++ )
{
sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(64,ybuffer,y_ptr,inc_y);
y_ptr += 64 * inc_y;
a_ptr += 64;
}
if ( m2 & 32 )
{
sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(32,ybuffer,y_ptr,inc_y);
y_ptr += 32 * inc_y;
a_ptr += 32;
}
if ( m2 & 16 )
{
sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(16,ybuffer,y_ptr,inc_y);
y_ptr += 16 * inc_y;
a_ptr += 16;
}
if ( m2 & 8 )
{
sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(8,ybuffer,y_ptr,inc_y);
y_ptr += 8 * inc_y;
a_ptr += 8;
}
if ( m2 & 4 )
{
sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(4,ybuffer,y_ptr,inc_y);
y_ptr += 4 * inc_y;
a_ptr += 4;
}
if ( m2 & 2 )
{
sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(2,ybuffer,y_ptr,inc_y);
y_ptr += 2 * inc_y;
a_ptr += 2;
}
if ( m2 & 1 )
{
sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
add_y(1,ybuffer,y_ptr,inc_y);
}
}
return(0);
}


@@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
"addq $8 , %8 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t"
"vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
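Note on the instruction set: vfmaddps is the four-operand FMA4 form available on Bulldozer/Piledriver. In the AT&T operand order used above, "vfmaddps A, B, C, D" computes D = B*C + A with a single rounding. A scalar C model of one lane (a sketch, assuming C99 fmaf; not part of the patch):
#include <math.h>

/* One lane of "vfmaddps A, B, C, D": D = B*C + A, fused */
static inline float fma4_lane(float A, float B, float C)
{
	return fmaf(B, C, A);	/* single rounding, like the hardware FMA */
}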


@@ -1,451 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero
"vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero
"vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero
"vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp
"vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha
"vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha
"vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha
"vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha
"vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha
"vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %8 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8 , %8 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
"vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
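A rough intrinsics rendering of the FMA3 inner loop above, for readers who prefer C to inline assembly (a sketch only, assuming AVX2/FMA, n a multiple of 8, and compilation with -mfma; the name is illustrative and this is not the code the build uses):
#include <immintrin.h>

/* Sketch of the 4x4 FMA3 kernel: y[0..n-1] += alpha * (A0*x0 + A1*x1 + A2*x2 + A3*x3) */
static void sgemv_4x4_fma3_sketch(long n, float **ap, float *x, float *y, float alpha)
{
	__m256 x0 = _mm256_broadcast_ss(&x[0]);
	__m256 x1 = _mm256_broadcast_ss(&x[1]);
	__m256 x2 = _mm256_broadcast_ss(&x[2]);
	__m256 x3 = _mm256_broadcast_ss(&x[3]);
	__m256 va = _mm256_broadcast_ss(&alpha);
	long i;
	for ( i = 0; i < n; i += 8 )	/* eight rows per iteration */
	{
		__m256 acc = _mm256_setzero_ps();
		acc = _mm256_fmadd_ps(_mm256_loadu_ps(&ap[0][i]), x0, acc);
		acc = _mm256_fmadd_ps(_mm256_loadu_ps(&ap[1][i]), x1, acc);
		acc = _mm256_fmadd_ps(_mm256_loadu_ps(&ap[2][i]), x2, acc);
		acc = _mm256_fmadd_ps(_mm256_loadu_ps(&ap[3][i]), x3, acc);
		_mm256_storeu_ps(&y[i], _mm256_fmadd_ps(acc, va, _mm256_loadu_ps(&y[i])));
	}
}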


@@ -1,461 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*2;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
"vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
"vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
"vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
"vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}


@@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss 16(%2), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
"shufps $0, %%xmm3 , %%xmm3 \n\t"
"movss (%9), %%xmm6 \n\t" // alpha
"shufps $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
".align 2 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
".align 2 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"mulps %%xmm14, %%xmm10 \n\t"
"mulps %%xmm15, %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"movups (%4,%8,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t"
".align 2 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
"mulps %%xmm2 , %%xmm10 \n\t"
"mulps %%xmm3 , %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
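
// Reference C for the SSE sgemv_kernel_4x8 above, under the same
// calling convention (a sketch; the _ref name is hypothetical):
// ap[j] points at column j, ap[j] + lda4 at column j+4, and each
// iteration folds eight columns into one row of y, scaled by *alpha.
static void sgemv_kernel_4x8_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG i;
	int j;
	for ( i = 0; i < n; i++ )
	{
		FLOAT t = 0.0;
		for ( j = 0; j < 4; j++ )
		{
			t += ap[j][i]        * x[j];     // columns 0..3
			t += ap[j][lda4 + i] * x[4 + j]; // columns 4..7
		}
		y[i] += t * alpha[0];
	}
}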
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
"movss (%8), %%xmm6 \n\t" // alpha
"shufps $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
"mulps %%xmm14, %%xmm10 \n\t"
"mulps %%xmm15, %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm9 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm10 , %%xmm4 \n\t"
"addps %%xmm4 , %%xmm11 \n\t"
"mulps %%xmm6 , %%xmm11 \n\t"
"addps %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,370 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4, %8 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8, %8 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %8 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y
"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t"
"vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t"
"vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
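
// Both AVX kernels above first peel an (n & 4) chunk with xmm and an
// (n & 8) chunk with ymm, so the unrolled main loop always runs on a
// multiple of 16. The arithmetic itself is the plain update below
// (a scalar sketch; the _ref name is hypothetical):
static void sgemv_kernel_4x4_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i;
	for ( i = 0; i < n; i++ )
	{
		y[i] += alpha[0] * ( ap[0][i] * x[0] + ap[1][i] * x[1]
		                   + ap[2][i] * x[2] + ap[3][i] * x[3] );
	}
}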

View File

@ -1,473 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*2;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
"vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
"vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
"vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
"vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"prefetcht0 128(%%r8)\n\t" // Prefetch
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"prefetcht0 192(%%r8)\n\t" // Prefetch
"vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp
"vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp
"vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp
"vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
"vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
"vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
"vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
float *pre = a + lda*3;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"movq %6, %%r8\n\t" // address for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"prefetcht0 64(%%r8)\n\t" // Prefetch
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"nop \n\t"
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
"prefetcht0 (%%r8)\n\t" // Prefetch
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y), // 5
"m" (pre) // 6
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}
static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
"addq $4 , %%rdi \n\t" // increment pointer of c
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
".L01LOOP%=: \n\t"
"vmovss (%%rdi), %%xmm0 \n\t" // load values of c
"addq $4 , %%rdi \n\t" // increment pointer of c
"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

kernel/x86_64/sgemv_t_4.c Normal file
View File

@ -0,0 +1,624 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell-4.c"
#endif
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
}
#endif
static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorps %%xmm10 , %%xmm10 \n\t"
"xorps %%xmm11 , %%xmm11 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
"movups 16(%5,%0,4) , %%xmm14 \n\t" // x
"movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0
"movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1
"mulps %%xmm14 , %%xmm12 \n\t"
"mulps %%xmm14 , %%xmm13 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm11, %%xmm11 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm11, %%xmm11 \n\t"
"movss %%xmm10, (%2) \n\t"
"movss %%xmm11,4(%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
"r" (x) // 5
: "cc",
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
i=0;
__asm__ __volatile__
(
"xorps %%xmm9 , %%xmm9 \n\t"
"xorps %%xmm10 , %%xmm10 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"mulps %%xmm11 , %%xmm12 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
".L01LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups 16(%3,%0,4) , %%xmm14 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"movups 16(%4,%0,4) , %%xmm13 \n\t"
"mulps %%xmm11 , %%xmm12 \n\t"
"mulps %%xmm13 , %%xmm14 \n\t"
"addq $8 , %0 \n\t"
"addps %%xmm12 , %%xmm10 \n\t"
"subq $8 , %1 \n\t"
"addps %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
".L01END%=: \n\t"
"addps %%xmm9 , %%xmm10 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"movss %%xmm10, (%2) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
: "cc",
"%xmm9", "%xmm10" ,
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
"memory"
);
}
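
// The two haddps at the end of the 4x2 and 4x1 kernels above fold the
// four partial sums of an xmm accumulator into one scalar before the
// movss store. In scalar terms (hsum4_ref is a hypothetical name):
// after the first haddps the register holds {v0+v1, v2+v3, v0+v1,
// v2+v3}; after the second, every lane holds the full sum.
static float hsum4_ref(const float v[4])
{
	return (v[0] + v[1]) + (v[2] + v[3]);
}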
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
if ( inc_dest != 1 )
{
for ( i=0; i<n; i++ )
{
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
i=0;
__asm__ __volatile__
(
"movss (%2) , %%xmm10 \n\t"
"shufps $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
"mulps %%xmm10 , %%xmm12 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm12 , %%xmm11 \n\t"
"subq $4 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
: "cc",
"%xmm10", "%xmm11", "%xmm12",
"memory"
);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + NBMAX;
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
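
// Stripped of the NBMAX blocking, the x packing, and the m%4 tail
// cases, CNAME above computes the transposed update y += alpha * A^T x.
// A minimal scalar reference (a sketch with a hypothetical name; the
// real driver buffers x and ytemp for cache locality):
static void sgemv_t_ref( BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i, j;
	for ( j = 0; j < n; j++ )
	{
		FLOAT temp = 0.0;
		for ( i = 0; i < m; i++ )
			temp += a[j * lda + i] * x[i * inc_x]; // dot of column j with x
		y[j * inc_y] += alpha * temp;
	}
}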

View File

@ -1,232 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell.c"
#else
#include "sgemv_t_microk_sandy.c"
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
}
static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
{
FLOAT register temp0 = 0.0;
BLASLONG i;
for ( i=0; i<n ; i++)
{
temp0 += a[i] * x[i];
}
temp0 *= alpha ;
*y += temp0;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *a_ptrl;
BLASLONG m1;
BLASLONG register m2;
FLOAT *xbuffer;
xbuffer = buffer;
BLASLONG register Mblock;
m1 = m / 1024 ;
m2 = m % 1024 ;
x_ptr = x;
a_ptr = a;
for (j=0; j<m1; j++)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(1024,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += 1024;
x_ptr += 1024 * inc_x;
}
if ( m2 == 0 ) return(0);
Mblock = 512;
while ( Mblock >= 16 )
{
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
}
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(Mblock,x_ptr,xbuffer,inc_x);
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
a_ptr += Mblock;
x_ptr += Mblock * inc_x;
}
Mblock /= 2;
if ( m2 & Mblock)
{
xbuffer = x_ptr;
y_ptr = y;
a_ptrl = a_ptr;
for(i = 0; i<n; i++ )
{
sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
y_ptr += inc_y;
a_ptrl += lda;
}
}
return(0);
}

View File

@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"prefetcht0 384(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
"prefetcht0 384(%5,%0,4) \n\t"
".align 2 \n\t"
"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
"prefetcht0 384(%6,%0,4) \n\t"
".align 2 \n\t"
"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t"
"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"prefetcht0 384(%7,%0,4) \n\t"
"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t"
"addq $16, %0 \n\t"
"vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t"
"subq $16, %1 \n\t"
"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vmovss %%xmm4, (%3) \n\t"
"vmovss %%xmm5, 4(%3) \n\t"
"vmovss %%xmm6, 8(%3) \n\t"
"vmovss %%xmm7, 12(%3) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
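
// vfmaddps is the four-operand FMA4 form available on Bulldozer and
// Piledriver. Functionally the kernel above is four simultaneous dot
// products, one per column pointer in ap, stored (not accumulated)
// to y[0..3]; the caller applies alpha afterwards. A scalar sketch
// (hypothetical _ref name):
static void sgemv_kernel_4x4_t_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0;
	for ( i = 0; i < n; i++ )
	{
		t0 += ap[0][i] * x[i];
		t1 += ap[1][i] * x[i];
		t2 += ap[2][i] * x[i];
		t3 += ap[3][i] * x[i];
	}
	y[0] = t0; y[1] = t1; y[2] = t2; y[3] = t3;
}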

View File

@ -1,99 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
//n = n / 16;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
"sarq $4, %%rax \n\t" // n = n / 16
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 512(%%rsi) \n\t"
"prefetcht0 (%%r8) \n\t" //prefetch next line of a
"vmovups (%%rsi), %%xmm4 \n\t"
"vmovups 4*4(%%rsi), %%xmm5 \n\t"
"vmovups 8*4(%%rsi), %%xmm6 \n\t"
"vmovups 12*4(%%rsi), %%xmm7 \n\t"
"vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp
"vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp
"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
"addq $16*4 , %%rsi \n\t" // increment pointer of a
"addq $16*4 , %%rdi \n\t" // increment pointer of c
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t"
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
"prefetcht0 384(%4,%0,4) \n\t"
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
"prefetcht0 384(%5,%0,4) \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"prefetcht0 384(%6,%0,4) \n\t"
"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"
"prefetcht0 384(%7,%0,4) \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
"vextractf128 $1 , %%ymm7, %%xmm15 \n\t"
"vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
"vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
"vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vmovss %%xmm4, (%3) \n\t"
"vmovss %%xmm5, 4(%3) \n\t"
"vmovss %%xmm6, 8(%3) \n\t"
"vmovss %%xmm7, 12(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
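For orientation: this new FMA kernel computes four dot products in one pass, one per column pointer in ap[], and writes the four horizontal sums to y[0..3]. A minimal scalar sketch of that contract (assuming FLOAT and BLASLONG are the usual OpenBLAS typedefs, and n a multiple of 4 as the asm's remainder handling requires; the _ref name is ours, not part of the commit):

/* Hedged scalar reference for sgemv_kernel_4x4: four length-n dot
 * products, one per column pointer in ap[], written (not accumulated)
 * into y[0..3]. Assumes n is a multiple of 4. */
static void sgemv_kernel_4x4_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG j;
	FLOAT s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
	for (j = 0; j < n; j++) {
		s0 += ap[0][j] * x[j];
		s1 += ap[1][j] * x[j];
		s2 += ap[2][j] * x[j];
		s3 += ap[3][j] * x[j];
	}
	y[0] = s0;
	y[1] = s1;
	y[2] = s2;
	y[3] = s3;
}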

View File

@@ -1,100 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
//n = n / 16;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
"sarq $4, %%rax \n\t" // n = n / 16
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 512(%%rsi) \n\t"
"prefetcht0 (%%r8) \n\t" //prefetch next line of a
"vmovups (%%rsi), %%xmm4 \n\t"
"vmovups 4*4(%%rsi), %%xmm5 \n\t"
"vmovups 8*4(%%rsi), %%xmm6 \n\t"
"vmovups 12*4(%%rsi), %%xmm7 \n\t"
"vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp
"vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp
"vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp
"vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp
"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
"addq $16*4 , %%rsi \n\t" // increment pointer of a
"addq $16*4 , %%rdi \n\t" // increment pointer of c
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12\n\t"
"vaddss (%%rdx), %%xmm12,%%xmm12\n\t"
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
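The removed kernel above had a different contract: one output element at a time, an alpha-scaled dot product accumulated into *y, with lda consumed only by the next-line prefetch. A hedged scalar sketch of its effect, for contrast with the new 4x4 interface (the _ref name is ours):

/* Hedged scalar reference for the removed sgemv_kernel_16: one
 * alpha-scaled dot product accumulated into *y. The asm iterated
 * floor(n/16) times over blocks of 16 floats; lda only fed the
 * next-line prefetch. */
static void sgemv_kernel_16_ref(long n, float alpha, float *a, long lda,
                                float *x, float *y)
{
	long j, limit = n - (n & 15);   /* floor(n/16) * 16 */
	float s = 0.0f;
	(void) lda;                     /* prefetch hint only in the asm */
	for (j = 0; j < limit; j++)
		s += a[j] * x[j];
	*y += alpha * s;
}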

View File

@@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"xorps %%xmm6 , %%xmm6 \n\t"
"xorps %%xmm7 , %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
"movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1
"movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2
"movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm12, %%xmm9 \n\t"
"mulps %%xmm12, %%xmm10 \n\t"
"mulps %%xmm12, %%xmm11 \n\t"
"addps %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"addps %%xmm9 , %%xmm5 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm10, %%xmm6 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"jnz .L01LOOP%= \n\t"
"haddps %%xmm4, %%xmm4 \n\t"
"haddps %%xmm5, %%xmm5 \n\t"
"haddps %%xmm6, %%xmm6 \n\t"
"haddps %%xmm7, %%xmm7 \n\t"
"haddps %%xmm4, %%xmm4 \n\t"
"haddps %%xmm5, %%xmm5 \n\t"
"haddps %%xmm6, %%xmm6 \n\t"
"haddps %%xmm7, %%xmm7 \n\t"
"movss %%xmm4, (%3) \n\t"
"movss %%xmm5, 4(%3) \n\t"
"movss %%xmm6, 8(%3) \n\t"
"movss %%xmm7, 12(%3) \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12",
"memory"
);
}
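The four haddps pairs at the end of this SSE variant collapse each 4-lane accumulator into the scalar stored to y. As an intrinsics sketch of just that reduction step (not code from this commit):

#include <pmmintrin.h>   /* SSE3: _mm_hadd_ps */

/* Sketch of the two-haddps reduction used above: after the first
 * hadd, the lanes hold {a+b, c+d, a+b, c+d}; after the second,
 * lane 0 holds a+b+c+d. */
static float hsum128(__m128 v)
{
	v = _mm_hadd_ps(v, v);
	v = _mm_hadd_ps(v, v);
	return _mm_cvtss_f32(v);
}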

View File

@@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
"vzeroupper \n\t"
"vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t"
"vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t"
"vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t"
"vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"
".L08LABEL%=: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t"
"subq $8 , %1 \n\t"
"vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"
".L16LABEL%=: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
"prefetcht0 384(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t"
"prefetcht0 384(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t"
"vaddps %%ymm1, %%ymm10, %%ymm1 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
"prefetcht0 384(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t"
"prefetcht0 384(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t"
"addq $16, %0 \n\t"
"vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t"
"vaddps %%ymm7, %%ymm10, %%ymm7 \n\t"
"subq $16, %1 \n\t"
"vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"
"jnz .L01LOOP%= \n\t"
".L16END%=: \n\t"
"vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"
"vaddps %%ymm6, %%ymm2, %%ymm6 \n\t"
"vaddps %%ymm7, %%ymm3, %%ymm7 \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
"vextractf128 $1 , %%ymm7, %%xmm15 \n\t"
"vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
"vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
"vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"
"vmovss %%xmm4, (%3) \n\t"
"vmovss %%xmm5, 4(%3) \n\t"
"vmovss %%xmm6, 8(%3) \n\t"
"vmovss %%xmm7, 12(%3) \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
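Lacking FMA, this variant issues separate vmulps/vaddps pairs and keeps two accumulators per column (ymm0-3 alongside ymm4-7) so that independent add chains can overlap in flight, folding the partials together only after the loop. The same latency-hiding pattern in scalar form, as a sketch:

/* Sketch of the two-accumulator pattern the AVX kernel uses per
 * column: independent partial sums break the vaddps dependency
 * chain and are combined once at the end. */
static float dot_two_acc(const float *a, const float *x, long n)
{
	float s0 = 0.0f, s1 = 0.0f;
	long j;
	for (j = 0; j + 1 < n; j += 2) {   /* two independent chains */
		s0 += a[j]     * x[j];
		s1 += a[j + 1] * x[j + 1];
	}
	if (j < n)
		s0 += a[j] * x[j];
	return s0 + s1;
}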

View File

@@ -1,106 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
//n = n / 16;
__asm__ __volatile__
(
"movq %0, %%rax\n\t" // n -> rax
"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
"movq %2, %%rsi\n\t" // adress of a -> rsi
"movq %3, %%rcx\n\t" // value of lda > rcx
"movq %4, %%rdi\n\t" // adress of x -> rdi
"movq %5, %%rdx\n\t" // adress of y -> rdx
"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
"sarq $4, %%rax \n\t" // n = n / 16
".align 16 \n\t"
".L01LOOP%=: \n\t"
// "prefetcht0 512(%%rsi) \n\t"
"prefetcht0 (%%r8) \n\t" //prefetch next line of a
"vmovups (%%rsi), %%xmm4 \n\t"
"vmovups 4*4(%%rsi), %%xmm5 \n\t"
"vmovups 8*4(%%rsi), %%xmm6 \n\t"
"vmovups 12*4(%%rsi), %%xmm7 \n\t"
"vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp
"vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp
"vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp
"vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp
"vaddps %%xmm12, %%xmm8 , %%xmm12\n\t"
"vaddps %%xmm13, %%xmm9 , %%xmm13\n\t"
"vaddps %%xmm14, %%xmm10, %%xmm14\n\t"
"vaddps %%xmm15, %%xmm11, %%xmm15\n\t"
"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
"addq $16*4 , %%rsi \n\t" // increment pointer of a
"addq $16*4 , %%rdi \n\t" // increment pointer of c
"dec %%rax \n\t" // n = n -1
"jnz .L01LOOP%= \n\t"
"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
"vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"
"vaddss (%%rdx), %%xmm12, %%xmm12\n\t"
"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
:
:
"m" (n), // 0
"m" (alpha), // 1
"m" (a), // 2
"m" (lda), // 3
"m" (x), // 4
"m" (y) // 5
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@@ -1,6 +1,6 @@
 Data file for testing DSGESV/DSPOSV LAPACK routines
 12                      Number of values of M
-0 1 2 13 17 45 78 91 101 119 120 132   values of M (row dimension)
+0 1 2 13 17 45 78 91 101 119 112 132   values of M (row dimension)
 6                       Number of values of NRHS
 1 2 14 15 16 13         Values of NRHS (number of right hand sides)
 30.0                    Threshold value of test ratio