Merge pull request #445 from wernsaar/develop

A lot of optimizations for gemv kernels

commit d13e92f07e
@@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){

  blasint inc_x=1,inc_y=1;
  blasint n=0;
  int has_param_n = 0;
  int has_param_m = 0;
  int loops = 1;
  int l;
  char *p;
@@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){

  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  int tomax = to;

  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY")))   inc_y = atoi(p);
  if ((p = getenv("OPENBLAS_TRANS")))  trans=*p;
  if ((p = getenv("OPENBLAS_PARAM_N"))) {
    n = atoi(p);
    if ((n>0) && (n<=to)) has_param_n = 1;
    if ((n>0)) has_param_n = 1;
    if ( n > tomax ) tomax = n;
  }
  if ( has_param_n == 0 )
  if ((p = getenv("OPENBLAS_PARAM_M"))) {
    m = atoi(p);
    if ((m>0)) has_param_m = 1;
    if ( m > tomax ) tomax = m;
  }

  if ( has_param_n == 1 )
    fprintf(stderr, "From : %3d  To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops);
  else
    fprintf(stderr, "From : %3d  To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);

  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
@@ -177,22 +187,20 @@ int MAIN__(int argc, char *argv[]){

  fprintf(stderr, "   SIZE       Flops\n");

  for(m = from; m <= to; m += step)
  if (has_param_m == 0)
  {

    for(m = from; m <= to; m += step)
    {
      timeg=0;

      if ( has_param_n == 0 ) n = m;

      fprintf(stderr, " %6dx%d : ", (int)m,(int)n);

      for(j = 0; j < m; j++){
        for(i = 0; i < n * COMPSIZE; i++){
          a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
      }

      for (l=0; l<loops; l++)
      {
@@ -204,24 +212,56 @@ int MAIN__(int argc, char *argv[]){

          y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
        gettimeofday( &start, (struct timezone *)0);

        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );

        gettimeofday( &stop, (struct timezone *)0);

        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

        timeg += time1;

      }

      timeg /= loops;

      fprintf(stderr,
        " %10.2f MFlops\n",
        COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
      fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);

    }
  }
  else
  {

    for(n = from; n <= to; n += step)
    {
      timeg=0;
      fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
      for(j = 0; j < m; j++){
        for(i = 0; i < n * COMPSIZE; i++){
          a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
      }

      for (l=0; l<loops; l++)
      {

        for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
          x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }

        for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
          y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
        gettimeofday( &start, (struct timezone *)0);
        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg += time1;

      }

      timeg /= loops;

      fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);

    }
  }

  return 0;
}
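The essence of the new OPENBLAS_LOOPS handling is wall-clock timing averaged over repeated calls. A minimal standalone sketch of that pattern — work() here is a hypothetical stand-in for the timed GEMV call, not part of the benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/time.h>

	/* Hypothetical payload standing in for the timed GEMV call. */
	static double work(void)
	{
		double s = 0.0;
		for (int i = 0; i < 1000000; i++) s += (double)i * 0.5;
		return s;
	}

	int main(void)
	{
		struct timeval start, stop;
		char *p;
		int loops = 1;
		double timeg = 0.0;

		/* Same environment knob the benchmark reads. */
		if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);

		for (int l = 0; l < loops; l++) {
			gettimeofday(&start, (struct timezone *)0);
			work();
			gettimeofday(&stop, (struct timezone *)0);
			timeg += (double)(stop.tv_sec - start.tv_sec)
			       + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
		}
		timeg /= loops;   /* averaged time, as in the patch */
		fprintf(stderr, "avg %f s over %d loop(s)\n", timeg, loops);
		return 0;
	}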
@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
#    its contributors may be used to endorse or promote products
#    derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************

set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Sgemv\nTRANS=T\nBulldozer"
plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
set output "print.png";
show title;
show plot;
show output;
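A likely workflow, though the commit does not spell it out: run the GEMV benchmark once per thread count, save the size/MFlops columns of its stderr output as '1-THREAD', '2-THREADS' and '4-THREADS', then load this script in gnuplot to overlay the three smoothed curves.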
@@ -46,6 +46,7 @@
#define __volatile__
#endif

/*
#ifdef HAVE_SSE2
#define MB   __asm__ __volatile__ ("mfence");
#define WMB  __asm__ __volatile__ ("sfence");

@@ -53,6 +54,10 @@
#define MB
#define WMB
#endif
*/

#define MB
#define WMB

static void __inline blas_lock(volatile BLASULONG *address){

@@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
  : "0" (op));
}

/*
#define WHEREAMI
*/

static inline int WhereAmI(void){
  int eax, ebx, ecx, edx;

@@ -111,6 +118,7 @@ static inline int WhereAmI(void){
  return apicid;
}

#ifdef CORE_BARCELONA
#define IFLUSH		gotoblas_iflush()
#define IFLUSH_HALF	gotoblas_iflush_half()
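Net effect of these hunks: the SSE2 mfence/sfence definitions are commented out and MB/WMB become empty on this target — presumably relying on x86-64's strong memory ordering so the hot GEMV loops no longer pay for explicit fences.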
@@ -251,7 +251,11 @@ void blas_set_parameter(void){

  env_var_t p;
  int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
  int size = 16;
#else
  int size = get_L2_size();
#endif

#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
  size >>= 7;
@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order,
  int nthreads_avail = nthreads_max;

  double MNK = (double) m * (double) n;
  if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
  if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
    nthreads_max = 1;

  if ( nthreads_max > nthreads_avail )
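To put the new single-thread cut-off in perspective: with a hypothetical GEMM_MULTITHREAD_THRESHOLD of 4, the old bound was 500.0 * 100.0 * 4 = 200,000 while the new one is 24.0 * 24.0 * 16 = 9,216. GEMV therefore falls back to one thread only up to roughly a 96x96 problem instead of ~447x447, so the faster kernels start using threads much earlier.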
@@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL =  ssymv_U.c
SSYMV_L_KERNEL =  ssymv_L.c

SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.c

@@ -1,8 +1,8 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

DGEMVNKERNEL = dgemv_n.c
DGEMVTKERNEL = dgemv_t.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c

ZGEMVNKERNEL = zgemv_n.c
ZGEMVTKERNEL = zgemv_t.c

@@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL =  ssymv_U.c
SSYMV_L_KERNEL =  ssymv_L.c

SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
DGEMVNKERNEL = dgemv_n.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c

SGEMMKERNEL    =  gemm_kernel_4x8_nehalem.S
SGEMMINCOPY    =  gemm_ncopy_4.S

@@ -1,5 +1,5 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.S

@@ -1,5 +1,5 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n.c
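These hunks edit the per-target KERNEL make fragments: the build compiles whatever source each SGEMVNKERNEL/SGEMVTKERNEL (etc.) variable names, so repointing them from sgemv_n.c/dgemv_n.c to the new *_4.c files is what switches the affected CPU targets over to the 4-column blocked kernels added below.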
@@ -0,0 +1,548 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"


#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL)
#include "dgemv_n_microk_haswell-4.c"
#endif


#define NBMAX 2048

#ifndef HAVE_KERNEL_4x8

static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	FLOAT *b0,*b1,*b2,*b3;
	FLOAT *x4;
	FLOAT x[8];
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];
	b0 = a0 + lda4 ;
	b1 = a1 + lda4 ;
	b2 = a2 + lda4 ;
	b3 = a3 + lda4 ;
	x4 = x + 4;

	for ( i=0; i<8; i++)
		x[i] = xo[i] * *alpha;

	for ( i=0; i< n; i+=4 )
	{

		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];

		y[i]   += b0[i]*x4[0]   + b1[i]*x4[1]   + b2[i]*x4[2]   + b3[i]*x4[3];
		y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
		y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
		y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];

	}
}

#endif


#ifndef HAVE_KERNEL_4x4

static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	FLOAT x[4];
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];

	for ( i=0; i<4; i++)
		x[i] = xo[i] * *alpha;

	for ( i=0; i< n; i+=4 )
	{
		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
	}
}

#endif

#ifndef HAVE_KERNEL_4x2

static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	 (%2) , %%xmm12		\n\t"	// x0
	"movsd	 (%6) , %%xmm4		\n\t"	// alpha
	"movsd	8(%2) , %%xmm13		\n\t"	// x1
	"mulsd	%%xmm4 , %%xmm12	\n\t"	// alpha
	"mulsd	%%xmm4 , %%xmm13	\n\t"	// alpha
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"movups	  (%3,%0,8), %%xmm4	\n\t"	// 2 * y
	"movups	16(%3,%0,8), %%xmm5	\n\t"	// 2 * y

	"movups	  (%4,%0,8), %%xmm8	\n\t"
	"movups	  (%5,%0,8), %%xmm9	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"

	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm5		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"

	"movups	%%xmm4 ,   (%3,%0,8)	\n\t"	// 2 * y
	"movups	%%xmm5 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (alpha)	// 6
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#endif

#ifndef HAVE_KERNEL_4x1

static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	(%2), %%xmm12		\n\t"	// x0
	"mulsd	(%5), %%xmm12		\n\t"	// alpha
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"movups	  (%4,%0,8), %%xmm8	\n\t"	// 2 * a
	"movups	16(%4,%0,8), %%xmm9	\n\t"	// 2 * a
	"movups	  (%3,%0,8), %%xmm4	\n\t"	// 2 * y
	"movups	16(%3,%0,8), %%xmm5	\n\t"	// 2 * y
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm12, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"

	"movups	%%xmm4 ,   (%3,%0,8)	\n\t"	// 2 * y
	"movups	%%xmm5 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap),	// 4
	  "r" (alpha)	// 5
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#endif

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG i;
	if ( inc_dest != 1 )
	{
		for ( i=0; i<n; i++ )
		{
			*dest += *src;
			src++;
			dest += inc_dest;
		}
		return;
	}

}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[4];
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4 = lda << 2;
	BLASLONG lda8 = lda << 3;
	FLOAT xbuffer[8],*ybuffer;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	ybuffer = buffer;

	if ( inc_x == 1 )
	{
		n1 = n >> 3 ;
		n2 = n &  7 ;
	}
	else
	{
		n1 = n >> 2 ;
		n2 = n &  3 ;

	}

	m3 = m & 3  ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;


	y_ptr = y;

	BLASLONG NB = NBMAX;

	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		a_ptr = a;
		x_ptr = x;

		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		if ( inc_y != 1 )
			memset(ybuffer,0,NB*8);
		else
			ybuffer = y_ptr;

		if ( inc_x == 1 )
		{

			for( i = 0; i < n1 ; i++)
			{
				dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
				ap[0] += lda8;
				ap[1] += lda8;
				ap[2] += lda8;
				ap[3] += lda8;
				a_ptr += lda8;
				x_ptr += 8;
			}

			if ( n2 & 4 )
			{
				dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				a_ptr += lda4;
				x_ptr += 4;
			}

			if ( n2 & 2 )
			{
				dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
				a_ptr += lda*2;
				x_ptr += 2;
			}

			if ( n2 & 1 )
			{
				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
				a_ptr += lda;
				x_ptr += 1;

			}

		}
		else
		{

			for( i = 0; i < n1 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[1] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[3] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for( i = 0; i < n2 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
				a_ptr += lda;

			}

		}

		a += NB;
		if ( inc_y != 1 )
		{
			add_y(NB,ybuffer,y_ptr,inc_y);
			y_ptr += NB * inc_y;
		}
		else
			y_ptr += NB ;

	}

	if ( m3 == 0 ) return(0);

	if ( m3 == 3 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		FLOAT temp2 = 0.0;
		if ( lda == 3 && inc_x ==1 )
		{

			for( i = 0; i < ( n & -4 ); i+=4 )
			{

				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

				a_ptr += 12;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += 3;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;

			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp2;
		return(0);
	}


	if ( m3 == 2 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		if ( lda == 2 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4) ; i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				a_ptr += 8;
				x_ptr += 4;

			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += 2;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;

			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		return(0);
	}

	if ( m3 == 1 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp = 0.0;
		if ( lda == 1 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4); i+=4 )
			{
				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

			}

			for( ; i < n; i++ )
			{
				temp += a_ptr[i] * x_ptr[i];
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp += a_ptr[0] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}

		}
		y_ptr[0] += alpha * temp;
		return(0);
	}


	return(0);
}
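The driver's structure — and the reason for NBMAX — can be summarized in plain C. A simplified sketch under column-major, unit-stride assumptions (illustrative only, not the file itself): y is processed in strips of at most NBMAX rows so the strip stays cache-resident while every column of the block is streamed over it.

	#include <stddef.h>

	#define NBMAX 2048  /* rows of y processed per block, as in the file */

	/* Simplified y += alpha * A * x for column-major A with leading
	   dimension lda and unit strides. */
	static void dgemv_n_blocked(size_t m, size_t n, double alpha,
	                            const double *a, size_t lda,
	                            const double *x, double *y)
	{
		for (size_t i0 = 0; i0 < m; i0 += NBMAX) {
			size_t nb = (m - i0 < NBMAX) ? (m - i0) : NBMAX;
			for (size_t j = 0; j < n; j++) {
				const double *col = a + i0 + j * lda;
				double xj = alpha * x[j];
				for (size_t i = 0; i < nb; i++)
					y[i0 + i] += col[i] * xj;   /* AXPY on the strip */
			}
		}
	}

The real kernel additionally walks four or eight columns per pass, which is what the 4x4/4x8 micro-kernels above implement.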
@@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper			\n\t"
	"vbroadcastsd	  (%2), %%ymm12	\n\t"	// x0
	"vbroadcastsd	 8(%2), %%ymm13	\n\t"	// x1
	"vbroadcastsd	16(%2), %%ymm14	\n\t"	// x2
	"vbroadcastsd	24(%2), %%ymm15	\n\t"	// x3
	"vbroadcastsd	32(%2), %%ymm0	\n\t"	// x4
	"vbroadcastsd	40(%2), %%ymm1	\n\t"	// x5
	"vbroadcastsd	48(%2), %%ymm2	\n\t"	// x6
	"vbroadcastsd	56(%2), %%ymm3	\n\t"	// x7

	"vbroadcastsd	  (%9), %%ymm6	\n\t"	// alpha

	"testq	$0x04, %1		\n\t"
	"jz	.L8LABEL%=		\n\t"

	"vmovupd	(%3,%0,8), %%ymm7	\n\t"	// 4 * y
	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"

	"vfmadd231pd	(%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	(%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	(%4,%8,8), %%ymm0 , %%ymm4	\n\t"
	"vfmadd231pd	(%5,%8,8), %%ymm1 , %%ymm5	\n\t"
	"vfmadd231pd	(%6,%8,8), %%ymm2 , %%ymm4	\n\t"
	"vfmadd231pd	(%7,%8,8), %%ymm3 , %%ymm5	\n\t"

	"vaddpd	%%ymm4 , %%ymm5 , %%ymm5	\n\t"
	"vmulpd	%%ymm6 , %%ymm5 , %%ymm5	\n\t"
	"vaddpd	%%ymm7 , %%ymm5 , %%ymm5	\n\t"


	"vmovupd	%%ymm5, (%3,%0,8)	\n\t"	// 4 * y

	"addq	$4 , %8			\n\t"
	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	".L8LABEL%=:			\n\t"

	"cmpq	$0, %1			\n\t"
	"je	.L16END%=		\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"

	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	  (%3,%0,8), %%ymm8	\n\t"	// 4 * y
	"vmovupd	32(%3,%0,8), %%ymm9	\n\t"	// 4 * y

	"vfmadd231pd	  (%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	32(%4,%0,8), %%ymm12, %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%0,8), %%ymm13, %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%0,8), %%ymm14, %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%0,8), %%ymm15, %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	  (%4,%8,8), %%ymm0 , %%ymm4	\n\t"
	"addq	$8 , %0			\n\t"
	"vfmadd231pd	32(%4,%8,8), %%ymm0 , %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%8,8), %%ymm1 , %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%8,8), %%ymm1 , %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%8,8), %%ymm2 , %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%8,8), %%ymm2 , %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%8,8), %%ymm3 , %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%8,8), %%ymm3 , %%ymm5	\n\t"

	"vfmadd231pd	%%ymm6 , %%ymm4 , %%ymm8	\n\t"
	"vfmadd231pd	%%ymm6 , %%ymm5 , %%ymm9	\n\t"

	"addq	$8 , %8			\n\t"
	"vmovupd	%%ymm8,-64(%3,%0,8)	\n\t"	// 4 * y
	"subq	$8 , %1			\n\t"
	"vmovupd	%%ymm9,-32(%3,%0,8)	\n\t"	// 4 * y

	"jnz	.L01LOOP%=		\n\t"

	".L16END%=:			\n\t"
	"vzeroupper			\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (lda4),	// 8
	  "r" (alpha)	// 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}



#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper			\n\t"
	"vbroadcastsd	  (%2), %%ymm12	\n\t"	// x0
	"vbroadcastsd	 8(%2), %%ymm13	\n\t"	// x1
	"vbroadcastsd	16(%2), %%ymm14	\n\t"	// x2
	"vbroadcastsd	24(%2), %%ymm15	\n\t"	// x3

	"vbroadcastsd	  (%8), %%ymm6	\n\t"	// alpha

	"testq	$0x04, %1		\n\t"
	"jz	.L8LABEL%=		\n\t"

	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	(%3,%0,8), %%ymm7	\n\t"	// 4 * y

	"vfmadd231pd	(%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	(%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vaddpd	%%ymm4 , %%ymm5 , %%ymm5	\n\t"
	"vmulpd	%%ymm6 , %%ymm5 , %%ymm5	\n\t"
	"vaddpd	%%ymm7 , %%ymm5 , %%ymm5	\n\t"

	"vmovupd	%%ymm5, (%3,%0,8)	\n\t"	// 4 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	".L8LABEL%=:			\n\t"

	"cmpq	$0, %1			\n\t"
	"je	.L8END%=		\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	  (%3,%0,8), %%ymm8	\n\t"	// 4 * y
	"vmovupd	32(%3,%0,8), %%ymm9	\n\t"	// 4 * y

	"vfmadd231pd	  (%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	32(%4,%0,8), %%ymm12, %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%0,8), %%ymm13, %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%0,8), %%ymm14, %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%0,8), %%ymm15, %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	%%ymm6 , %%ymm4 , %%ymm8	\n\t"
	"vfmadd231pd	%%ymm6 , %%ymm5 , %%ymm9	\n\t"

	"vmovupd	%%ymm8,   (%3,%0,8)	\n\t"	// 4 * y
	"vmovupd	%%ymm9, 32(%3,%0,8)	\n\t"	// 4 * y

	"addq	$8 , %0			\n\t"
	"subq	$8 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	".L8END%=:			\n\t"
	"vzeroupper			\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (alpha)	// 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
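For readers who don't speak AT&T assembly, the 4x4 Haswell kernel above is, in spirit, the following AVX2/FMA intrinsics loop. This is a hedged re-expression for illustration, not part of the patch; it ignores the asm's 8-element unrolling and tail handling, and assumes n is a multiple of 4.

	#include <immintrin.h>

	/* y[0..n) += alpha * (ap[0]*x[0] + ap[1]*x[1] + ap[2]*x[2] + ap[3]*x[3]),
	   four doubles of y per iteration. */
	static void dgemv_kernel_4x4_intrin(long n, double **ap, const double *x,
	                                    double *y, const double *alpha)
	{
		__m256d x0 = _mm256_set1_pd(x[0]);   /* like vbroadcastsd */
		__m256d x1 = _mm256_set1_pd(x[1]);
		__m256d x2 = _mm256_set1_pd(x[2]);
		__m256d x3 = _mm256_set1_pd(x[3]);
		__m256d va = _mm256_set1_pd(*alpha);

		for (long i = 0; i < n; i += 4) {
			__m256d acc = _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), x1, acc);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), x2, acc);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), x3, acc);
			/* y += alpha * acc, matching the final vfmadd231pd on ymm6 */
			__m256d vy = _mm256_loadu_pd(&y[i]);
			vy = _mm256_fmadd_pd(va, acc, vy);
			_mm256_storeu_pd(&y[i], vy);
		}
	}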
@@ -0,0 +1,265 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	  (%2), %%xmm12		\n\t"	// x0
	"movsd	 8(%2), %%xmm13		\n\t"	// x1
	"movsd	16(%2), %%xmm14		\n\t"	// x2
	"movsd	24(%2), %%xmm15		\n\t"	// x3
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"
	"shufpd	$0, %%xmm14, %%xmm14	\n\t"
	"shufpd	$0, %%xmm15, %%xmm15	\n\t"

	"movsd	32(%2), %%xmm0		\n\t"	// x4
	"movsd	40(%2), %%xmm1		\n\t"	// x5
	"movsd	48(%2), %%xmm2		\n\t"	// x6
	"movsd	56(%2), %%xmm3		\n\t"	// x7
	"shufpd	$0, %%xmm0 , %%xmm0	\n\t"
	"shufpd	$0, %%xmm1 , %%xmm1	\n\t"
	"shufpd	$0, %%xmm2 , %%xmm2	\n\t"
	"shufpd	$0, %%xmm3 , %%xmm3	\n\t"

	"movsd	  (%9), %%xmm6		\n\t"	// alpha
	"shufpd	$0, %%xmm6 , %%xmm6	\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	".align 2			\n\t"
	"movups	(%4,%0,8), %%xmm8	\n\t"
	"movups	(%5,%0,8), %%xmm9	\n\t"
	"movups	(%6,%0,8), %%xmm10	\n\t"
	"movups	(%7,%0,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"movups	(%4,%8,8), %%xmm8	\n\t"
	"movups	(%5,%8,8), %%xmm9	\n\t"
	"movups	(%6,%8,8), %%xmm10	\n\t"
	"movups	(%7,%8,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm0 , %%xmm8		\n\t"
	"mulpd	%%xmm1 , %%xmm9		\n\t"
	"mulpd	%%xmm2 , %%xmm10	\n\t"
	"mulpd	%%xmm3 , %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"addpd	%%xmm5 , %%xmm4		\n\t"
	"mulpd	%%xmm6 , %%xmm4		\n\t"
	"addpd	%%xmm4 , %%xmm7		\n\t"

	"movups	%%xmm7 , (%3,%0,8)	\n\t"	// 2 * y

	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	16(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	".align 2			\n\t"
	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"movups	16(%6,%0,8), %%xmm10	\n\t"
	"movups	16(%7,%0,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"movups	16(%4,%8,8), %%xmm8	\n\t"
	"movups	16(%5,%8,8), %%xmm9	\n\t"
	"movups	16(%6,%8,8), %%xmm10	\n\t"
	"movups	16(%7,%8,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm0 , %%xmm8		\n\t"
	"mulpd	%%xmm1 , %%xmm9		\n\t"
	"mulpd	%%xmm2 , %%xmm10	\n\t"
	"mulpd	%%xmm3 , %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"addq	$4 , %8			\n\t"
	"addpd	%%xmm5 , %%xmm4		\n\t"
	"mulpd	%%xmm6 , %%xmm4		\n\t"
	"addpd	%%xmm4 , %%xmm7		\n\t"

	"movups	%%xmm7 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (lda4),	// 8
	  "r" (alpha)	// 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}




#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	  (%2), %%xmm12		\n\t"	// x0
	"movsd	 8(%2), %%xmm13		\n\t"	// x1
	"movsd	16(%2), %%xmm14		\n\t"	// x2
	"movsd	24(%2), %%xmm15		\n\t"	// x3
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"
	"shufpd	$0, %%xmm14, %%xmm14	\n\t"
	"shufpd	$0, %%xmm15, %%xmm15	\n\t"

	"movsd	  (%8), %%xmm6		\n\t"	// alpha
	"shufpd	$0, %%xmm6 , %%xmm6	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	"movups	(%4,%0,8), %%xmm8	\n\t"
	"movups	(%5,%0,8), %%xmm9	\n\t"
	"movups	(%6,%0,8), %%xmm10	\n\t"
	"movups	(%7,%0,8), %%xmm11	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"
	"addpd	%%xmm10 , %%xmm4	\n\t"
	"addpd	%%xmm4 , %%xmm11	\n\t"

	"mulpd	%%xmm6 , %%xmm11	\n\t"
	"addpd	%%xmm7 , %%xmm11	\n\t"
	"movups	%%xmm11, (%3,%0,8)	\n\t"	// 2 * y

	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	16(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"movups	16(%6,%0,8), %%xmm10	\n\t"
	"movups	16(%7,%0,8), %%xmm11	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"
	"addpd	%%xmm10 , %%xmm4	\n\t"
	"addpd	%%xmm4 , %%xmm11	\n\t"

	"mulpd	%%xmm6 , %%xmm11	\n\t"
	"addpd	%%xmm7 , %%xmm11	\n\t"
	"movups	%%xmm11, 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (alpha)	// 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
@ -0,0 +1,615 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(HASWELL)
|
||||
#include "dgemv_t_microk_haswell-4.c"
|
||||
#endif
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1,*a2,*a3;
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
a2 = ap[2];
|
||||
a3 = ap[3];
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
FLOAT temp2 = 0.0;
|
||||
FLOAT temp3 = 0.0;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
|
||||
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
|
||||
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
|
||||
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
|
||||
}
|
||||
y[0] = temp0;
|
||||
y[1] = temp1;
|
||||
y[2] = temp2;
|
||||
y[3] = temp3;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||
"xorpd %%xmm11 , %%xmm11 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
"movups 16(%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
"haddpd %%xmm11, %%xmm11 \n\t"
|
||||
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
"movsd %%xmm11,8(%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
"r" (x) // 5
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorpd %%xmm9 , %%xmm9 \n\t"
|
||||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"mulpd %%xmm11 , %%xmm12 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups 16(%3,%0,8) , %%xmm14 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"movups 16(%4,%0,8) , %%xmm13 \n\t"
|
||||
"mulpd %%xmm11 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm13 , %%xmm14 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"addpd %%xmm14 , %%xmm9 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
|
||||
"addpd %%xmm9 , %%xmm10 \n\t"
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
: "cc",
|
||||
"%xmm9", "%xmm10" ,
|
||||
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
||||
{
|
||||
BLASLONG i;
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
*dest = *src;
|
||||
dest++;
|
||||
src += inc_src;
|
||||
}
|
||||
}
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
|
||||
if ( inc_dest != 1 )
|
||||
{
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
*dest += src[i] * da;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movsd (%2) , %%xmm10 \n\t"
|
||||
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"mulpd %%xmm10 , %%xmm12 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm11 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
"movups %%xmm11, -16(%4,%0,8) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
: "cc",
|
||||
"%xmm10", "%xmm11", "%xmm12",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG register i;
|
||||
BLASLONG register j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
BLASLONG n0;
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[4],*xbuffer;
|
||||
FLOAT *ytemp;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
xbuffer = buffer;
|
||||
ytemp = buffer + NBMAX;
|
||||
|
||||
n0 = n / NBMAX;
|
||||
n1 = (n % NBMAX) >> 2 ;
|
||||
n2 = n & 3 ;
|
||||
|
||||
m3 = m & 3 ;
|
||||
m1 = m & -4 ;
|
||||
m2 = (m & (NBMAX-1)) - m3 ;
|
||||
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
if ( m2 == 0 ) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
y_ptr = y;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
if ( inc_x == 1 )
|
||||
xbuffer = x_ptr;
|
||||
else
|
||||
copy_x(NB,x_ptr,xbuffer,inc_x);
|
||||
|
||||
|
||||
FLOAT *ap[4];
|
||||
FLOAT *yp;
|
||||
BLASLONG register lda4 = 4 * lda;
|
||||
ap[0] = a_ptr;
|
||||
ap[1] = a_ptr + lda;
|
||||
ap[2] = ap[1] + lda;
|
||||
ap[3] = ap[2] + lda;
|
||||
|
||||
if ( n0 > 0 )
|
||||
{
|
||||
BLASLONG nb1 = NBMAX / 4;
|
||||
for( j=0; j<n0; j++)
|
||||
{
|
||||
|
||||
yp = ytemp;
|
||||
for( i = 0; i < nb1 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
|
||||
ap[0] += lda4 ;
|
||||
ap[1] += lda4 ;
|
||||
ap[2] += lda4 ;
|
||||
ap[3] += lda4 ;
|
||||
yp += 4;
|
||||
}
|
||||
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
|
||||
y_ptr += nb1 * inc_y * 4;
|
||||
a_ptr += nb1 * lda4 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
yp = ytemp;
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
|
||||
ap[0] += lda4 ;
|
||||
ap[1] += lda4 ;
|
||||
ap[2] += lda4 ;
|
||||
ap[3] += lda4 ;
|
||||
yp += 4;
|
||||
}
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
|
||||
y_ptr += n1 * inc_y * 4;
|
||||
a_ptr += n1 * lda4 ;
|
||||
}
|
||||
|
||||
if ( n2 & 2 )
|
||||
{
|
||||
|
||||
dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
|
||||
a_ptr += lda * 2;
|
||||
*y_ptr += ybuffer[0] * alpha;
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1] * alpha;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
|
||||
if ( n2 & 1 )
|
||||
{
|
||||
|
||||
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
|
||||
a_ptr += lda;
|
||||
*y_ptr += ybuffer[0] * alpha;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
a += NB;
|
||||
x += NB * inc_x;
|
||||
}
|
||||
|
||||
if ( m3 == 0 ) return(0);
|
||||
|
||||
x_ptr = x;
|
||||
a_ptr = a;
|
||||
if ( m3 == 3 )
|
||||
{
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp2 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
        if ( lda == 3 && inc_y == 1 )
        {

            for ( j=0; j< ( n & -4) ; j+=4 )
            {

                y_ptr[j]   += aj[0] * xtemp0 + aj[1]  * xtemp1 + aj[2]  * xtemp2;
                y_ptr[j+1] += aj[3] * xtemp0 + aj[4]  * xtemp1 + aj[5]  * xtemp2;
                y_ptr[j+2] += aj[6] * xtemp0 + aj[7]  * xtemp1 + aj[8]  * xtemp2;
                y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
                aj += 12;
            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
                aj += 3;
            }

        }
        else
        {

            if ( inc_y == 1 )
            {

                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;

                for ( j=0; j< ( n & -4 ); j+=4 )
                {

                    y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
                    y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 + *(aj+lda+2)  * xtemp2;
                    y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
                    y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
                    aj += lda4;
                }

                for ( ; j< n ; j++ )
                {

                    y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
                    aj += lda;
                }

            }
            else
            {

                for ( j=0; j<n; j++ )
                {
                    *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
                    y_ptr += inc_y;
                    aj += lda;
                }

            }

        }
        return(0);
    }

    if ( m3 == 2 )
    {
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;

        FLOAT *aj = a_ptr;
        y_ptr = y;

        if ( lda == 2 && inc_y == 1 )
        {

            for ( j=0; j< ( n & -4) ; j+=4 )
            {
                y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 ;
                y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
                y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
                y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
                aj += 8;

            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
                aj += 2;
            }

        }
        else
        {
            if ( inc_y == 1 )
            {

                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;

                for ( j=0; j< ( n & -4 ); j+=4 )
                {

                    y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 ;
                    y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
                    y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
                    aj += lda4;
                }

                for ( ; j< n ; j++ )
                {

                    y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    aj += lda;
                }

            }
            else
            {
                for ( j=0; j<n; j++ )
                {
                    *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    y_ptr += inc_y;
                    aj += lda;
                }
            }

        }
        return(0);

    }

    FLOAT xtemp = *x_ptr * alpha;
    FLOAT *aj = a_ptr;
    y_ptr = y;
    if ( lda == 1 && inc_y == 1 )
    {
        for ( j=0; j< ( n & -4) ; j+=4 )
        {
            y_ptr[j]   += aj[j]   * xtemp;
            y_ptr[j+1] += aj[j+1] * xtemp;
            y_ptr[j+2] += aj[j+2] * xtemp;
            y_ptr[j+3] += aj[j+3] * xtemp;
        }
        for ( ; j<n ; j++ )
        {
            y_ptr[j] += aj[j] * xtemp;
        }

    }
    else
    {
        if ( inc_y == 1 )
        {

            BLASLONG register lda2 = lda << 1;
            BLASLONG register lda4 = lda << 2;
            BLASLONG register lda3 = lda2 + lda;
            for ( j=0; j< ( n & -4 ); j+=4 )
            {
                y_ptr[j]   += *aj        * xtemp;
                y_ptr[j+1] += *(aj+lda)  * xtemp;
                y_ptr[j+2] += *(aj+lda2) * xtemp;
                y_ptr[j+3] += *(aj+lda3) * xtemp;
                aj += lda4 ;
            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += *aj * xtemp;
                aj += lda;
            }

        }
        else
        {
            for ( j=0; j<n; j++ )
            {
                *y_ptr += *aj * xtemp;
                y_ptr += inc_y;
                aj += lda;
            }

        }
    }

    return(0);
}
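
/* Editor's note: the remainder paths above all share one shape: a main loop
   over n & -4 iterations, unrolled four ways, followed by a scalar tail for
   the last n % 4 iterations. A minimal self-contained sketch of that pattern
   (the function and data below are illustrative, not part of this file): */

#include <stdio.h>

static void scale_add(int n, const double *a, double s, double *y)
{
    int j;
    for ( j = 0; j < (n & -4); j += 4 )   /* unrolled main loop */
    {
        y[j]   += a[j]   * s;
        y[j+1] += a[j+1] * s;
        y[j+2] += a[j+2] * s;
        y[j+3] += a[j+3] * s;
    }
    for ( ; j < n; j++ )                  /* scalar tail, 0..3 iterations */
        y[j] += a[j] * s;
}

int main(void)
{
    double a[6] = { 1, 2, 3, 4, 5, 6 }, y[6] = { 0 };
    scale_add(6, a, 2.0, y);
    printf("%g %g\n", y[0], y[5]);        /* prints: 2 12 */
    return 0;
}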
@ -0,0 +1,127 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vzeroupper                              \n\t"
        "vxorpd  %%ymm4 , %%ymm4, %%ymm4         \n\t"
        "vxorpd  %%ymm5 , %%ymm5, %%ymm5         \n\t"
        "vxorpd  %%ymm6 , %%ymm6, %%ymm6         \n\t"
        "vxorpd  %%ymm7 , %%ymm7, %%ymm7         \n\t"

        "testq   $0x04, %1                       \n\t"
        "jz      .L08LABEL%=                     \n\t"

        "vmovups (%2,%0,8), %%ymm12              \n\t"  // 4 * x

        "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4  \n\t"
        "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5  \n\t"
        "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6  \n\t"
        "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7  \n\t"

        "addq    $4 , %0                         \n\t"
        "subq    $4 , %1                         \n\t"

        ".L08LABEL%=:                            \n\t"

        "cmpq    $0, %1                          \n\t"
        "je      .L16END%=                       \n\t"

        ".align 16                               \n\t"
        ".L01LOOP%=:                             \n\t"
        // "prefetcht0 384(%2,%0,8)              \n\t"
        "vmovups (%2,%0,8), %%ymm12              \n\t"  // 4 * x
        "vmovups 32(%2,%0,8), %%ymm13            \n\t"  // 4 * x

        // "prefetcht0 384(%4,%0,8)              \n\t"
        "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4  \n\t"
        "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5  \n\t"
        // "prefetcht0 384(%5,%0,8)              \n\t"
        "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6  \n\t"
        "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7  \n\t"
        // "prefetcht0 384(%6,%0,8)              \n\t"
        "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
        "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
        "addq    $8 , %0                         \n\t"
        // "prefetcht0 384(%7,%0,8)              \n\t"
        "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
        "subq    $8 , %1                         \n\t"
        "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"

        "jnz     .L01LOOP%=                      \n\t"

        ".L16END%=:                              \n\t"

        "vextractf128 $1 , %%ymm4, %%xmm12       \n\t"
        "vextractf128 $1 , %%ymm5, %%xmm13       \n\t"
        "vextractf128 $1 , %%ymm6, %%xmm14       \n\t"
        "vextractf128 $1 , %%ymm7, %%xmm15       \n\t"

        "vaddpd  %%xmm4, %%xmm12, %%xmm4         \n\t"
        "vaddpd  %%xmm5, %%xmm13, %%xmm5         \n\t"
        "vaddpd  %%xmm6, %%xmm14, %%xmm6         \n\t"
        "vaddpd  %%xmm7, %%xmm15, %%xmm7         \n\t"

        "vhaddpd %%xmm4, %%xmm4, %%xmm4          \n\t"
        "vhaddpd %%xmm5, %%xmm5, %%xmm5          \n\t"
        "vhaddpd %%xmm6, %%xmm6, %%xmm6          \n\t"
        "vhaddpd %%xmm7, %%xmm7, %%xmm7          \n\t"

        "vmovsd  %%xmm4,   (%3)                  \n\t"
        "vmovsd  %%xmm5,  8(%3)                  \n\t"
        "vmovsd  %%xmm6, 16(%3)                  \n\t"
        "vmovsd  %%xmm7, 24(%3)                  \n\t"

        "vzeroupper                              \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
        : "cc",
          "%xmm4", "%xmm5", "%xmm6", "%xmm7",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
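
/* Editor's note: a plain-C reference for what the kernel above computes,
   assuming FLOAT is double and n is a positive multiple of 4 (the asm
   handles an n & 4 head, then loops eight elements at a time). Each of the
   four column pointers is reduced to a dot product against x, and y[0..3]
   is overwritten, not accumulated. The _ref name is illustrative only: */

static void dgemv_kernel_4x4_ref(long n, double **ap, double *x, double *y)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    for ( long i = 0; i < n; i++ )
    {
        s0 += ap[0][i] * x[i];
        s1 += ap[1][i] * x[i];
        s2 += ap[2][i] * x[i];
        s3 += ap[3][i] * x[i];
    }
    y[0] = s0;  y[1] = s1;  y[2] = s2;  y[3] = s3;
}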
@ -0,0 +1,591 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_n_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell-4.c"
#endif
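
/* Editor's note: the include chain above is the dispatch mechanism. Each
   microkernel file defines HAVE_KERNEL_* for the kernels it implements in
   assembly; the plain-C versions further down in this file are compiled
   only when the corresponding macro is absent. A generic, self-contained
   illustration of the pattern (names here are hypothetical): */

#include <stdio.h>

/* an arch-specific header included earlier may provide an optimized
   kernel_4x4() and define HAVE_KERNEL_4x4; otherwise this portable
   fallback is used */
#ifndef HAVE_KERNEL_4x4
static void kernel_4x4(void) { puts("portable C fallback"); }
#endif

int main(void) { kernel_4x4(); return 0; }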

#define NBMAX 4096

#ifndef HAVE_KERNEL_4x8

static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3;
    FLOAT *b0,*b1,*b2,*b3;
    FLOAT *x4;
    FLOAT x[8];
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];
    b0 = a0 + lda4 ;
    b1 = a1 + lda4 ;
    b2 = a2 + lda4 ;
    b3 = a3 + lda4 ;
    x4 = x + 4;

    for ( i=0; i<8; i++)
        x[i] = xo[i] * *alpha;

    for ( i=0; i< n; i+=4 )
    {

        y[i]   += a0[i]  *x[0] + a1[i]  *x[1] + a2[i]  *x[2] + a3[i]  *x[3];
        y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
        y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
        y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];

        y[i]   += b0[i]  *x4[0] + b1[i]  *x4[1] + b2[i]  *x4[2] + b3[i]  *x4[3];
        y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
        y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
        y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];

    }
}

#endif

#ifndef HAVE_KERNEL_4x4

static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3;
    FLOAT x[4];
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];

    for ( i=0; i<4; i++)
        x[i] = xo[i] * *alpha;

    for ( i=0; i< n; i+=4 )
    {
        y[i]   += a0[i]  *x[0] + a1[i]  *x[1] + a2[i]  *x[2] + a3[i]  *x[3];
        y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
        y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
        y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
    }
}

#endif
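
/* Editor's note: a hedged standalone harness for the 4x4 fallback's
   contract, y += alpha * A(:,0:3) * x over n rows (n a multiple of 4).
   The typedefs stand in for common.h and the identity columns make the
   expected result easy to check; everything here is illustrative: */

#include <stdio.h>

typedef float FLOAT;
typedef long  BLASLONG;

static void kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT x[4];
    for ( int k = 0; k < 4; k++ ) x[k] = xo[k] * *alpha;
    for ( BLASLONG i = 0; i < n; i++ )
        y[i] += ap[0][i]*x[0] + ap[1][i]*x[1] + ap[2][i]*x[2] + ap[3][i]*x[3];
}

int main(void)
{
    FLOAT c0[4] = {1,0,0,0}, c1[4] = {0,1,0,0}, c2[4] = {0,0,1,0}, c3[4] = {0,0,0,1};
    FLOAT *ap[4] = { c0, c1, c2, c3 };
    FLOAT x[4] = { 1, 2, 3, 4 }, y[4] = { 0 }, alpha = 2.0f;
    kernel_4x4(4, ap, x, y, &alpha);                 /* y += alpha * I * x */
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); /* prints: 2 4 6 8 */
    return 0;
}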

#ifndef HAVE_KERNEL_4x2

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "movss   (%2) , %%xmm12          \n\t"  // x0
        "movss   (%6) , %%xmm4           \n\t"  // alpha
        "movss  4(%2) , %%xmm13          \n\t"  // x1
        "mulss  %%xmm4 , %%xmm12         \n\t"  // alpha
        "mulss  %%xmm4 , %%xmm13         \n\t"  // alpha
        "shufps $0, %%xmm12, %%xmm12     \n\t"
        "shufps $0, %%xmm13, %%xmm13     \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "movups (%3,%0,4), %%xmm4        \n\t"  // 4 * y

        "movups (%4,%0,4), %%xmm8        \n\t"
        "movups (%5,%0,4), %%xmm9        \n\t"
        "mulps  %%xmm12, %%xmm8          \n\t"
        "mulps  %%xmm13, %%xmm9          \n\t"
        "addps  %%xmm8 , %%xmm4          \n\t"
        "addq   $4 , %0                  \n\t"
        "addps  %%xmm9 , %%xmm4          \n\t"

        "movups %%xmm4 , -16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (alpha)   // 6
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#endif

#ifndef HAVE_KERNEL_4x1

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i  = 0;
    BLASLONG register n1 = n & -8 ;
    BLASLONG register n2 = n & 4  ;

    __asm__ __volatile__
    (
        "movss  (%2), %%xmm12            \n\t"  // x0
        "mulss  (%6), %%xmm12            \n\t"  // alpha
        "shufps $0, %%xmm12, %%xmm12     \n\t"

        "cmpq   $0, %1                   \n\t"
        "je     .L16END%=                \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "movups   (%3,%0,4), %%xmm4      \n\t"  // 4 * y
        "movups 16(%3,%0,4), %%xmm5      \n\t"  // 4 * y
        "movups   (%4,%0,4), %%xmm8      \n\t"  // 4 * a
        "movups 16(%4,%0,4), %%xmm9      \n\t"  // 4 * a
        "mulps  %%xmm12, %%xmm8          \n\t"
        "mulps  %%xmm12, %%xmm9          \n\t"
        "addps  %%xmm4 , %%xmm8          \n\t"
        "addps  %%xmm5 , %%xmm9          \n\t"

        "addq   $8 , %0                  \n\t"
        "movups %%xmm8 , -32(%3,%0,4)    \n\t"  // 4 * y
        "movups %%xmm9 , -16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $8 , %1                  \n\t"

        "jnz    .L01LOOP%=               \n\t"

        ".L16END%=:                      \n\t"

        "testq  $0x04, %5                \n\t"
        "jz     .L08LABEL%=              \n\t"

        "movups (%3,%0,4), %%xmm4        \n\t"  // 4 * y
        "movups (%4,%0,4), %%xmm8        \n\t"  // 4 * a
        "mulps  %%xmm12, %%xmm8          \n\t"
        "addps  %%xmm8 , %%xmm4          \n\t"
        "movups %%xmm4 , (%3,%0,4)       \n\t"  // 4 * y
        "addq   $4 , %0                  \n\t"
        "subq   $4 , %1                  \n\t"

        ".L08LABEL%=:                    \n\t"
        :
        :
          "r" (i),      // 0
          "r" (n1),     // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap),     // 4
          "r" (n2),     // 5
          "r" (alpha)   // 6
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#endif

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    if ( inc_dest != 1 )
    {
        for ( i=0; i<n; i++ )
        {
            *dest += *src;
            src++;
            dest += inc_dest;
        }
        return;
    }

    i=0;

    __asm__ __volatile__
    (

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"

        "movups (%2,%0,4) , %%xmm12      \n\t"
        "movups (%3,%0,4) , %%xmm11      \n\t"
        "addps  %%xmm12   , %%xmm11      \n\t"
        "addq   $4 , %0                  \n\t"
        "movups %%xmm11, -16(%3,%0,4)    \n\t"

        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),     // 0
          "r" (n),     // 1
          "r" (src),   // 2
          "r" (dest)   // 3
        : "cc",
          "%xmm10", "%xmm11", "%xmm12",
          "memory"
    );

}
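
/* Editor's note: two details of add_y worth noting. The strided case is
   handled in plain C before the asm is reached, and the SSE loop assumes
   n is a positive multiple of 4, which holds for every NB the driver below
   passes in. A plain-C reference for the fast path: */

static void add_y_ref(long n, const float *src, float *dest)
{
    for ( long i = 0; i < n; i++ )   /* contiguous inc_dest == 1 case */
        dest[i] += src[i];
}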

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;
    BLASLONG lda8 = lda << 3;
    FLOAT xbuffer[8],*ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    if ( inc_x == 1 )
    {
        n1 = n >> 3 ;
        n2 = n & 7  ;
    }
    else
    {
        n1 = n >> 2 ;
        n2 = n & 3  ;

    }

    m3 = m & 3  ;
    m1 = m & -4 ;
    m2 = (m & (NBMAX-1)) - m3 ;

    y_ptr = y;

    BLASLONG NB = NBMAX;

    while ( NB == NBMAX )
    {

        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        if ( inc_y != 1 )
            memset(ybuffer,0,NB*4);   /* 4 == sizeof(FLOAT) for single precision */
        else
            ybuffer = y_ptr;

        if ( inc_x == 1 )
        {

            for( i = 0; i < n1 ; i++)
            {
                sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
                ap[0] += lda8;
                ap[1] += lda8;
                ap[2] += lda8;
                ap[3] += lda8;
                a_ptr += lda8;
                x_ptr += 8;
            }

            if ( n2 & 4 )
            {
                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
                /* ap[2]/ap[3] are not advanced: the remaining 4x2 and 4x1
                   steps below use only ap[0], ap[1] and a_ptr */
                ap[0] += lda4;
                ap[1] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if ( n2 & 2 )
            {
                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
                a_ptr += lda*2;
                x_ptr += 2;
            }

            if ( n2 & 1 )
            {
                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
                a_ptr += lda;
                x_ptr += 1;

            }

        }
        else
        {

            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
                a_ptr += lda;

            }

        }

        a += NB;
        if ( inc_y != 1 )
        {
            add_y(NB,ybuffer,y_ptr,inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB ;

    }

    if ( m3 == 0 ) return(0);

    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        if ( lda == 3 && inc_x ==1 )
        {

            for( i = 0; i < ( n & -4 ); i+=4 )
            {

                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3]  * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4]  * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5]  * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;

            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return(0);
    }

    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        if ( lda == 2 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4) ; i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;

            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;

            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return(0);
    }

    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        if ( lda == 1 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4); i+=4 )
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

            }

            for( ; i < n; i++ )
            {
                temp += a_ptr[i] * x_ptr[i];
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
        y_ptr[0] += alpha * temp;
        return(0);
    }

    return(0);
}
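
/* Editor's note: the row blocking in CNAME above splits m into full panels
   of NBMAX rows, at most one partial panel of m2 rows (a multiple of 4),
   and an m3 = m % 4 scalar tail. A self-contained sketch of just that
   partitioning logic, with the same loop shape: */

#include <stdio.h>

#define NBMAX 4096

int main(void)
{
    long m  = 10007;                     /* example row count */
    long m3 = m & 3;                     /* scalar tail rows */
    long m1 = m & -4;                    /* rows covered by panels */
    long m2 = (m & (NBMAX - 1)) - m3;    /* size of the partial panel */
    long NB = NBMAX;

    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0 )
        {
            if ( m2 == 0 ) break;        /* no partial panel: only the m3 tail remains */
            NB = m2;                     /* last, partial panel */
        }
        printf("panel of %ld rows\n", NB);
    }
    printf("scalar tail of %ld rows\n", m3);   /* panels: 4096, 4096, 1812; tail 3 */
    return 0;
}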
@ -1,218 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell.c"
#else
#include "sgemv_n_microk_sandy.c"
#endif

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG i;
    for ( i=0; i<n; i++ )
    {
        *dest = *src;
        dest++;
        src += inc_src;
    }
}

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    for ( i=0; i<n; i++ )
    {
        *dest += *src;
        src++;
        dest += inc_dest;
    }
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG register m2;
    BLASLONG register n2;
    FLOAT *xbuffer,*ybuffer;
    xbuffer = buffer;
    ybuffer = xbuffer + 2048 + 256;

    n1 = n / 512 ;
    n2 = n % 512 ;

    m1 = m / 64;
    m2 = m % 64;

    y_ptr = y;
    x_ptr = x;

    for (j=0; j<n1; j++)
    {

        if ( inc_x == 1 )
            xbuffer = x_ptr;
        else
            copy_x(512,x_ptr,xbuffer,inc_x);

        a_ptr = a + j * 512 * lda;
        y_ptr = y;

        for(i = 0; i<m1; i++ )
        {
            sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(64,ybuffer,y_ptr,inc_y);
            y_ptr += 64 * inc_y;
            a_ptr += 64;

        }

        if ( m2 & 32 )
        {
            sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(32,ybuffer,y_ptr,inc_y);
            y_ptr += 32 * inc_y;
            a_ptr += 32;

        }

        if ( m2 & 16 )
        {
            sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(16,ybuffer,y_ptr,inc_y);
            y_ptr += 16 * inc_y;
            a_ptr += 16;
        }
        if ( m2 & 8 )
        {
            sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(8,ybuffer,y_ptr,inc_y);
            y_ptr += 8 * inc_y;
            a_ptr += 8;
        }
        if ( m2 & 4 )
        {
            sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(4,ybuffer,y_ptr,inc_y);
            y_ptr += 4 * inc_y;
            a_ptr += 4;
        }
        if ( m2 & 2 )
        {
            sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(2,ybuffer,y_ptr,inc_y);
            y_ptr += 2 * inc_y;
            a_ptr += 2;
        }
        if ( m2 & 1 )
        {
            sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(1,ybuffer,y_ptr,inc_y);
        }
        x_ptr += 512 * inc_x;

    }

    if ( n2 > 0 )
    {

        if ( inc_x == 1 )
            xbuffer = x_ptr;
        else
            copy_x(n2,x_ptr,xbuffer,inc_x);

        a_ptr = a + n1 * 512 * lda;
        y_ptr = y;

        for(i = 0; i<m1; i++ )
        {
            sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(64,ybuffer,y_ptr,inc_y);
            y_ptr += 64 * inc_y;
            a_ptr += 64;

        }

        if ( m2 & 32 )
        {
            sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(32,ybuffer,y_ptr,inc_y);
            y_ptr += 32 * inc_y;
            a_ptr += 32;

        }
        if ( m2 & 16 )
        {
            sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(16,ybuffer,y_ptr,inc_y);
            y_ptr += 16 * inc_y;
            a_ptr += 16;
        }
        if ( m2 & 8 )
        {
            sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(8,ybuffer,y_ptr,inc_y);
            y_ptr += 8 * inc_y;
            a_ptr += 8;
        }
        if ( m2 & 4 )
        {
            sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(4,ybuffer,y_ptr,inc_y);
            y_ptr += 4 * inc_y;
            a_ptr += 4;
        }
        if ( m2 & 2 )
        {
            sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(2,ybuffer,y_ptr,inc_y);
            y_ptr += 2 * inc_y;
            a_ptr += 2;
        }
        if ( m2 & 1 )
        {
            sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(1,ybuffer,y_ptr,inc_y);
        }

    }
    return(0);
}
@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vbroadcastss   (%2), %%xmm12    \n\t"  // x0
        "vbroadcastss  4(%2), %%xmm13    \n\t"  // x1
        "vbroadcastss  8(%2), %%xmm14    \n\t"  // x2
        "vbroadcastss 12(%2), %%xmm15    \n\t"  // x3
        "vbroadcastss 16(%2), %%xmm0     \n\t"  // x4
        "vbroadcastss 20(%2), %%xmm1     \n\t"  // x5
        "vbroadcastss 24(%2), %%xmm2     \n\t"  // x6
        "vbroadcastss 28(%2), %%xmm3     \n\t"  // x7

        "vbroadcastss   (%9), %%xmm8     \n\t"  // alpha

        "testq  $0x04, %1                \n\t"
        "jz     .L08LABEL%=              \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
        "addq   $4 , %0                  \n\t"

        "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
        "addq   $4 , %8                  \n\t"

        "vaddps   %%xmm5 , %%xmm4, %%xmm4            \n\t"
        "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
        "subq   $4 , %1                  \n\t"
        "vmovups  %%xmm6, -16(%3,%0,4)   \n\t"  // 4 * y

        ".L08LABEL%=:                    \n\t"

        "testq  $0x08, %1                \n\t"
        "jz     .L16LABEL%=              \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%5,%0,4), %%xmm13, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%7,%0,4), %%xmm15, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vfmaddps %%xmm4,   (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"

        "vfmaddps   (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
        "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
        "vmovups  %%xmm4,   (%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm5, 16(%3,%0,4)    \n\t"  // 4 * y

        "addq   $8 , %0                  \n\t"
        "addq   $8 , %8                  \n\t"
        "subq   $8 , %1                  \n\t"

        ".L16LABEL%=:                    \n\t"

        "cmpq   $0, %1                   \n\t"
        "je     .L16END%=                \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"
        "vxorps %%xmm6, %%xmm6 , %%xmm6  \n\t"
        "vxorps %%xmm7, %%xmm7 , %%xmm7  \n\t"

        "prefetcht0 192(%4,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
        "prefetcht0 192(%5,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%5,%0,4), %%xmm13, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "prefetcht0 192(%6,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
        "prefetcht0 192(%7,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%7,%0,4), %%xmm15, %%xmm4 \n\t"
        ".align 2                        \n\t"
        "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"

        "prefetcht0 192(%4,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
        "prefetcht0 192(%5,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "prefetcht0 192(%6,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
        "prefetcht0 192(%7,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"

        "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"

        "vfmaddps   (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
        "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
        "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
        "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"

        "addq   $16, %0                  \n\t"
        "vmovups  %%xmm4,-64(%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm5,-48(%3,%0,4)    \n\t"  // 4 * y
        "addq   $16, %8                  \n\t"
        "vmovups  %%xmm6,-32(%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm7,-16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $16, %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        ".L16END%=:                      \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3]),  // 7
          "r" (lda4),   // 8
          "r" (alpha)   // 9
        : "cc",
          "%xmm0", "%xmm1",
          "%xmm2", "%xmm3",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vbroadcastss   (%2), %%xmm12    \n\t"  // x0
        "vbroadcastss  4(%2), %%xmm13    \n\t"  // x1
        "vbroadcastss  8(%2), %%xmm14    \n\t"  // x2
        "vbroadcastss 12(%2), %%xmm15    \n\t"  // x3

        "vbroadcastss   (%8), %%xmm8     \n\t"  // alpha

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vaddps   %%xmm4, %%xmm5, %%xmm4 \n\t"

        "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t"
        "vmovups  %%xmm6, (%3,%0,4)      \n\t"  // 4 * y

        "addq   $4 , %0                  \n\t"
        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3]),  // 7
          "r" (alpha)   // 8
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
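
/* Editor's note: unlike the Haswell kernels, which use the three-operand
   FMA3 form (vfmadd231ps), these Bulldozer/Piledriver kernels use AMD's
   four-operand FMA4 encoding. In the AT&T operand order written above,
   the first operand is the addend, the middle two are multiplied, and the
   last is the destination, so each accumulation step is
   acc = column * x_broadcast + acc, and the final update per 4-wide slice
   is y = acc * alpha + y (xmm8 holds the broadcast alpha). Scalar sketch
   of that fused update, for reference: */

static inline float fma4_update(float acc, float alpha, float y)
{
    return acc * alpha + y;   /* one fused multiply-add per lane */
}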
@ -1,451 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch
        "prefetcht0  64(%%r8)    \n\t"  // Prefetch

        "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t"  // set to zero
        "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t"  // set to zero
        "vxorps %%ymm10, %%ymm10, %%ymm10\n\t"  // set to zero
        "vxorps %%ymm11, %%ymm11, %%ymm11\n\t"  // set to zero
        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero
        "vxorps %%ymm13, %%ymm13, %%ymm13\n\t"  // set to zero
        "vxorps %%ymm14, %%ymm14, %%ymm14\n\t"  // set to zero
        "vxorps %%ymm15, %%ymm15, %%ymm15\n\t"  // set to zero
        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "nop                             \n\t"
        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch

        "prefetcht0    (%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm8 ,  0*4(%%rsi), %%ymm0, %%ymm8 \n\t"  // multiply a and c and add to temp
        "prefetcht0  64(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm9 ,  8*4(%%rsi), %%ymm0, %%ymm9 \n\t"  // multiply a and c and add to temp
        "prefetcht0 128(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t"  // multiply a and c and add to temp
        "prefetcht0 192(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t"  // multiply a and c and add to temp

        "addq   $4 , %%rdi               \n\t"  // increment pointer of c
        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm8 , %%ymm1, %%ymm8  \n\t"  // scale by alpha
        "vmulps %%ymm9 , %%ymm1, %%ymm9  \n\t"  // scale by alpha
        "vmulps %%ymm10, %%ymm1, %%ymm10 \n\t"  // scale by alpha
        "vmulps %%ymm11, %%ymm1, %%ymm11 \n\t"  // scale by alpha
        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha
        "vmulps %%ymm13, %%ymm1, %%ymm13 \n\t"  // scale by alpha
        "vmulps %%ymm14, %%ymm1, %%ymm14 \n\t"  // scale by alpha
        "vmulps %%ymm15, %%ymm1, %%ymm15 \n\t"  // scale by alpha

        "vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch
        "prefetcht0  64(%%r8)    \n\t"  // Prefetch

        "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t"  // set to zero
        "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t"  // set to zero
        "vxorps %%xmm10, %%xmm10, %%xmm10\n\t"  // set to zero
        "vxorps %%xmm11, %%xmm11, %%xmm11\n\t"  // set to zero
        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero
        "vxorps %%xmm13, %%xmm13, %%xmm13\n\t"  // set to zero
        "vxorps %%xmm14, %%xmm14, %%xmm14\n\t"  // set to zero
        "vxorps %%xmm15, %%xmm15, %%xmm15\n\t"  // set to zero
        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%xmm0    \n\t"  // load values of c
        "nop                             \n\t"
        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch

        "prefetcht0    (%%r8)            \n\t"  // Prefetch
        "vfmaddps %%xmm8 ,  0*4(%%rsi), %%xmm0, %%xmm8 \n\t"  // multiply a and c and add to temp
        "prefetcht0  64(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%xmm9 ,  4*4(%%rsi), %%xmm0, %%xmm9 \n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm10,  8*4(%%rsi), %%xmm0, %%xmm10\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t"  // multiply a and c and add to temp

        "addq   $4 , %%rdi               \n\t"  // increment pointer of c
        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%xmm8 , %%xmm1, %%xmm8  \n\t"  // scale by alpha
        "vmulps %%xmm9 , %%xmm1, %%xmm9  \n\t"  // scale by alpha
        "vmulps %%xmm10, %%xmm1, %%xmm10 \n\t"  // scale by alpha
        "vmulps %%xmm11, %%xmm1, %%xmm11 \n\t"  // scale by alpha
        "vmulps %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha
        "vmulps %%xmm13, %%xmm1, %%xmm13 \n\t"  // scale by alpha
        "vmulps %%xmm14, %%xmm1, %%xmm14 \n\t"  // scale by alpha
        "vmulps %%xmm15, %%xmm1, %%xmm15 \n\t"  // scale by alpha

        "vmovups %%xmm8 ,     (%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm9 ,  4*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm10,  8*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm11, 12*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm12, 16*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm13, 20*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm14, 24*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm15, 28*4(%%rdx)    \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
    );

}

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch

        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero
        "vxorps %%ymm13, %%ymm13, %%ymm13\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch
        "prefetcht0    (%%r8)            \n\t"  // Prefetch

        "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha
        "vmulps %%ymm13, %%ymm1, %%ymm13 \n\t"  // scale by alpha

        "vmovups %%ymm12,    (%%rdx)     \n\t"  // store temp -> y
        "vmovups %%ymm13, 8*4(%%rdx)     \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha

        "vmovups %%ymm12, (%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%xmm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha

        "vmovups %%xmm12, (%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vmovss        %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero
        "vxorps %%xmm13, %%xmm13, %%xmm13\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vmovss (%%rdi), %%xmm0          \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp
        "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha
        "vmulss %%xmm13, %%xmm1, %%xmm13 \n\t"  // scale by alpha

        "vmovss %%xmm12,  (%%rdx)        \n\t"  // store temp -> y
        "vmovss %%xmm13, 4(%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vmovss        %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vmovss (%%rdi), %%xmm0          \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha

        "vmovss %%xmm12, (%%rdx)         \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
@ -0,0 +1,299 @@
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
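	// y[i..] += alpha * sum of eight columns: ap[0..3] indexed by i (x0..x3)
	// plus ap[0..3] indexed by the running lda4 offset (x4..x7); 4- and 8-row
	// tails are peeled off before the unrolled 16-rows-per-pass FMA loop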
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3
	"vbroadcastss  16(%2), %%ymm0    \n\t"  // x4
	"vbroadcastss  20(%2), %%ymm1    \n\t"  // x5
	"vbroadcastss  24(%2), %%ymm2    \n\t"  // x6
	"vbroadcastss  28(%2), %%ymm3    \n\t"  // x7

	"vbroadcastss    (%9), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y
	"vxorps  %%xmm4 , %%xmm4, %%xmm4 \n\t"
	"vxorps  %%xmm5 , %%xmm5, %%xmm5 \n\t"

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"

	"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
	"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
	"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
	"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"

	"vaddps  %%xmm4 , %%xmm5 , %%xmm5 \n\t"
	"vmulps  %%xmm6 , %%xmm5 , %%xmm5 \n\t"
	"vaddps  %%xmm7 , %%xmm5 , %%xmm5 \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4 , %8                  \n\t"
	"addq   $4 , %0                  \n\t"
	"subq   $4 , %1                  \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y
	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
	"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
	"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
	"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"

	"vaddps  %%ymm4 , %%ymm5 , %%ymm5 \n\t"
	"vmulps  %%ymm6 , %%ymm5 , %%ymm5 \n\t"
	"vaddps  %%ymm7 , %%ymm5 , %%ymm5 \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8 , %8                  \n\t"
	"addq   $8 , %0                  \n\t"
	"subq   $8 , %1                  \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm8     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm9     \n\t"  // 8 * y

	"vfmadd231ps   (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps   (%5,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps   (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
	"vfmadd231ps   (%7,%0,4), %%ymm15, %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps   (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
	"addq   $16, %0                  \n\t"
	"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
	"vfmadd231ps   (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
	"vfmadd231ps   (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
	"vfmadd231ps   (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"

	"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
	"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

	"addq   $16, %8                  \n\t"
	"vmovups %%ymm8,-64(%3,%0,4)     \n\t"  // 8 * y
	"subq   $16, %1                  \n\t"
	"vmovups %%ymm9,-32(%3,%0,4)     \n\t"  // 8 * y

	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
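	// four-column variant: y[i..] += alpha * sum_{k<4} x[k] * ap[k][i..],
	// same 4/8-row tail peeling before the 16-rows-per-pass FMA loop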
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3

	"vbroadcastss    (%8), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"

	"vaddps  %%xmm4 , %%xmm5 , %%xmm5 \n\t"
	"vmulps  %%xmm6 , %%xmm5 , %%xmm5 \n\t"
	"vaddps  %%xmm7 , %%xmm5 , %%xmm5 \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4 , %0                  \n\t"
	"subq   $4 , %1                  \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vaddps  %%ymm4 , %%ymm5 , %%ymm5 \n\t"
	"vmulps  %%ymm6 , %%ymm5 , %%ymm5 \n\t"
	"vaddps  %%ymm7 , %%ymm5 , %%ymm5 \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8 , %0                  \n\t"
	"subq   $8 , %1                  \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm8     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm9     \n\t"  // 8 * y

	"vfmadd231ps   (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps   (%5,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps   (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
	"vfmadd231ps   (%7,%0,4), %%ymm15, %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
	"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

	"vmovups %%ymm8,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm9, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -1,461 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*2;

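	// old kernel: y[0:64] = alpha * A[0:64,0:n] * x, one column per iteration,
	// accumulated in ymm8..ymm15 with FMA; the prefetch pointer (pre) runs
	// two columns ahead of the column being processed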
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	"vxorps  %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
	"vxorps  %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
	"vxorps  %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
	"vxorps  %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch
	"vfmadd231ps  0*4(%%rsi), %%ymm0, %%ymm8 \n\t"  // multiply a and x and add to temp
	"vfmadd231ps  8*4(%%rsi), %%ymm0, %%ymm9 \n\t"  // multiply a and x and add to temp
	"prefetcht0 64(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t"  // multiply a and x and add to temp
	"prefetcht0 128(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and x and add to temp
	"prefetcht0 192(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t"  // multiply a and x and add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha
	"vmulps  %%ymm12, %%ymm1, %%ymm12\n\t"  // scale by alpha
	"vmulps  %%ymm13, %%ymm1, %%ymm13\n\t"  // scale by alpha
	"vmulps  %%ymm14, %%ymm1, %%ymm14\n\t"  // scale by alpha
	"vmulps  %%ymm15, %%ymm1, %%ymm15\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*3;

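	// 32-row variant without FMA: separate vmulps/vaddps into ymm8..ymm11;
	// the prefetch pointer runs three columns ahead (a + lda*3)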
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"vmulps  16*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  24*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp
	"vaddps  %%ymm10, %%ymm6, %%ymm10\n\t"  // add to temp
	"vaddps  %%ymm11, %%ymm7, %%ymm11\n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*3;

	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch

	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha

	"vmovups %%ymm8 ,    (%%rdx)     \n\t"  // store temp -> y
	"vmovups %%ymm9 , 8*4(%%rdx)     \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"     // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmovups %%ymm8 , (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%xmm1\n\t"  // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%xmm0    \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vaddps  %%xmm12, %%xmm4, %%xmm12 \n\t"    // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha

	"vmovups %%xmm12, (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vmovss  %1, %%xmm1\n\t" // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps  %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vmovss  (%%rdi), %%xmm0         \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vmulps  1*4(%%rsi), %%xmm0, %%xmm5 \n\t"  // multiply a and x

	"vaddps  %%xmm12, %%xmm4, %%xmm12 \n\t"  // add to temp
	"vaddps  %%xmm13, %%xmm5, %%xmm13 \n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulss  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha
	"vmulss  %%xmm13, %%xmm1, %%xmm13\n\t"  // scale by alpha

	"vmovss  %%xmm12,  (%%rdx)       \n\t"  // store temp -> y
	"vmovss  %%xmm13, 4(%%rdx)       \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vmovss  %1, %%xmm1\n\t" // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vmovss  (%%rdi), %%xmm0         \n\t"  // load value of x
	"addq    $4 , %%rdi              \n\t"  // increment pointer of x

	"vmulss  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vaddss  %%xmm12, %%xmm4, %%xmm12 \n\t"    // add to temp

	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulss  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha

	"vmovss  %%xmm12, (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
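	// SSE variant of the 4x8 kernel: x0..x7 and alpha are splat with
	// movss+shufps; four rows of y per iteration, n assumed a multiple of 4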
	__asm__ __volatile__
	(
	"movss    (%2), %%xmm12          \n\t"  // x0
	"movss   4(%2), %%xmm13          \n\t"  // x1
	"movss   8(%2), %%xmm14          \n\t"  // x2
	"movss  12(%2), %%xmm15          \n\t"  // x3
	"shufps $0, %%xmm12, %%xmm12\n\t"
	"shufps $0, %%xmm13, %%xmm13\n\t"
	"shufps $0, %%xmm14, %%xmm14\n\t"
	"shufps $0, %%xmm15, %%xmm15\n\t"

	"movss  16(%2), %%xmm0           \n\t"  // x4
	"movss  20(%2), %%xmm1           \n\t"  // x5
	"movss  24(%2), %%xmm2           \n\t"  // x6
	"movss  28(%2), %%xmm3           \n\t"  // x7
	"shufps $0, %%xmm0 , %%xmm0 \n\t"
	"shufps $0, %%xmm1 , %%xmm1 \n\t"
	"shufps $0, %%xmm2 , %%xmm2 \n\t"
	"shufps $0, %%xmm3 , %%xmm3 \n\t"

	"movss    (%9), %%xmm6           \n\t"  // alpha
	"shufps $0, %%xmm6 , %%xmm6 \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"xorps  %%xmm4 , %%xmm4          \n\t"
	"xorps  %%xmm5 , %%xmm5          \n\t"
	"movups (%3,%0,4), %%xmm7        \n\t"  // 4 * y

	".align 2                        \n\t"
	"movups (%4,%0,4), %%xmm8        \n\t"
	"movups (%5,%0,4), %%xmm9        \n\t"
	"movups (%6,%0,4), %%xmm10       \n\t"
	"movups (%7,%0,4), %%xmm11       \n\t"
	".align 2                        \n\t"
	"mulps  %%xmm12, %%xmm8          \n\t"
	"mulps  %%xmm13, %%xmm9          \n\t"
	"mulps  %%xmm14, %%xmm10         \n\t"
	"mulps  %%xmm15, %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addps  %%xmm9 , %%xmm5          \n\t"
	"addps  %%xmm10, %%xmm4          \n\t"
	"addps  %%xmm11, %%xmm5          \n\t"

	"movups (%4,%8,4), %%xmm8        \n\t"
	"movups (%5,%8,4), %%xmm9        \n\t"
	"movups (%6,%8,4), %%xmm10       \n\t"
	"movups (%7,%8,4), %%xmm11       \n\t"
	".align 2                        \n\t"
	"mulps  %%xmm0 , %%xmm8          \n\t"
	"mulps  %%xmm1 , %%xmm9          \n\t"
	"mulps  %%xmm2 , %%xmm10         \n\t"
	"mulps  %%xmm3 , %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addps  %%xmm9 , %%xmm5          \n\t"
	"addps  %%xmm10, %%xmm4          \n\t"
	"addps  %%xmm11, %%xmm5          \n\t"

	"addq   $4 , %8                  \n\t"
	"addps  %%xmm5 , %%xmm4          \n\t"
	"addq   $4 , %0                  \n\t"
	"mulps  %%xmm6 , %%xmm4          \n\t"
	"subq   $4 , %1                  \n\t"
	"addps  %%xmm4 , %%xmm7          \n\t"

	"movups %%xmm7 , -16(%3,%0,4)    \n\t"  // 4 * y

	"jnz    .L01LOOP%=               \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
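	// SSE variant of the 4x4 kernel: four columns, four rows of y per iteration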
	__asm__ __volatile__
	(
	"movss    (%2), %%xmm12          \n\t"  // x0
	"movss   4(%2), %%xmm13          \n\t"  // x1
	"movss   8(%2), %%xmm14          \n\t"  // x2
	"movss  12(%2), %%xmm15          \n\t"  // x3
	"shufps $0, %%xmm12, %%xmm12\n\t"
	"shufps $0, %%xmm13, %%xmm13\n\t"
	"shufps $0, %%xmm14, %%xmm14\n\t"
	"shufps $0, %%xmm15, %%xmm15\n\t"

	"movss    (%8), %%xmm6           \n\t"  // alpha
	"shufps $0, %%xmm6 , %%xmm6 \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"xorps  %%xmm4 , %%xmm4          \n\t"
	"movups (%3,%0,4), %%xmm7        \n\t"  // 4 * y

	"movups (%4,%0,4), %%xmm8        \n\t"
	"movups (%5,%0,4), %%xmm9        \n\t"
	"movups (%6,%0,4), %%xmm10       \n\t"
	"movups (%7,%0,4), %%xmm11       \n\t"
	"mulps  %%xmm12, %%xmm8          \n\t"
	"mulps  %%xmm13, %%xmm9          \n\t"
	"mulps  %%xmm14, %%xmm10         \n\t"
	"mulps  %%xmm15, %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addq   $4 , %0                  \n\t"
	"addps  %%xmm9 , %%xmm4          \n\t"
	"subq   $4 , %1                  \n\t"
	"addps  %%xmm10 , %%xmm4         \n\t"
	"addps  %%xmm4 , %%xmm11         \n\t"

	"mulps  %%xmm6 , %%xmm11         \n\t"
	"addps  %%xmm7 , %%xmm11         \n\t"
	"movups %%xmm11, -16(%3,%0,4)    \n\t"  // 4 * y

	"jnz    .L01LOOP%=               \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -0,0 +1,370 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
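	// AVX variant of the 4x8 kernel without FMA: vmulps into scratch registers,
	// vaddps into the ymm4/ymm5 accumulators; prefetcht0 stays 192 bytes ahead
	// of every column stream in the unrolled 16-row loop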
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3
	"vbroadcastss  16(%2), %%ymm0    \n\t"  // x4
	"vbroadcastss  20(%2), %%ymm1    \n\t"  // x5
	"vbroadcastss  24(%2), %%ymm2    \n\t"  // x6
	"vbroadcastss  28(%2), %%ymm3    \n\t"  // x7

	"vbroadcastss    (%9), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%xmm4 , %%xmm4 , %%xmm4 \n\t"
	"vxorps  %%xmm5 , %%xmm5 , %%xmm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vmulps (%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
	"vmulps (%6,%0,4), %%xmm14, %%xmm9  \n\t"
	"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vmulps (%4,%8,4), %%xmm0 , %%xmm8  \n\t"
	"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
	"vmulps (%6,%8,4), %%xmm2 , %%xmm9  \n\t"
	"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vaddps %%xmm5, %%xmm4 , %%xmm4  \n\t"
	"vmulps %%xmm6, %%xmm4 , %%xmm5  \n\t"
	"vaddps %%xmm5, %%xmm7 , %%xmm5  \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4, %8                   \n\t"
	"addq   $4, %0                   \n\t"
	"subq   $4, %1                   \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vmulps (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps (%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps (%4,%8,4), %%ymm0 , %%ymm8  \n\t"
	"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
	"vmulps (%6,%8,4), %%ymm2 , %%ymm9  \n\t"
	"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vaddps %%ymm5, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm4 , %%ymm5  \n\t"
	"vaddps %%ymm5, %%ymm7 , %%ymm5  \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8, %8                   \n\t"
	"addq   $8, %0                   \n\t"
	"subq   $8, %1                   \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"

	"prefetcht0 192(%4,%0,4)            \n\t"
	"vmulps   (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps 32(%4,%0,4), %%ymm12, %%ymm9  \n\t"
	"prefetcht0 192(%5,%0,4)            \n\t"
	"vmulps   (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%0,4)            \n\t"
	"vmulps   (%6,%0,4), %%ymm14, %%ymm8  \n\t"
	"vmulps 32(%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"prefetcht0 192(%7,%0,4)            \n\t"
	"vmulps   (%7,%0,4), %%ymm15, %%ymm10 \n\t"
	"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%4,%8,4)            \n\t"
	"vmulps   (%4,%8,4), %%ymm0 , %%ymm8  \n\t"
	"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9  \n\t"
	"prefetcht0 192(%5,%8,4)            \n\t"
	"vmulps   (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
	"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%8,4)            \n\t"
	"vmulps   (%6,%8,4), %%ymm2 , %%ymm8  \n\t"
	"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9  \n\t"
	"prefetcht0 192(%7,%8,4)            \n\t"
	"vmulps   (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
	"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps %%ymm6, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm5 , %%ymm5  \n\t"

	"vaddps   (%3,%0,4), %%ymm4 , %%ymm4 \n\t"  // 8 * y
	"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t"  // 8 * y

	"vmovups %%ymm4,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm5, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %8                  \n\t"
	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
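	// AVX variant of the 4x4 kernel without FMA; same 4/8-row tail handling
	// as the 4x8 kernel above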
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3

	"vbroadcastss    (%8), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vmulps (%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
	"vmulps (%6,%0,4), %%xmm14, %%xmm9  \n\t"
	"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vaddps %%xmm5, %%xmm4 , %%xmm4  \n\t"
	"vmulps %%xmm6, %%xmm4 , %%xmm5  \n\t"
	"vaddps %%xmm5, %%xmm7 , %%xmm5  \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4, %0                   \n\t"
	"subq   $4, %1                   \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vmulps (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps (%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vaddps %%ymm5, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm4 , %%ymm5  \n\t"
	"vaddps %%ymm5, %%ymm7 , %%ymm5  \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8, %0                   \n\t"
	"subq   $8, %1                   \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm0     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm1     \n\t"  // 8 * y

	"prefetcht0 192(%4,%0,4)            \n\t"
	"vmulps   (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps 32(%4,%0,4), %%ymm12, %%ymm9  \n\t"
	"prefetcht0 192(%5,%0,4)            \n\t"
	"vmulps   (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%0,4)            \n\t"
	"vmulps   (%6,%0,4), %%ymm14, %%ymm8  \n\t"
	"vmulps 32(%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"prefetcht0 192(%7,%0,4)            \n\t"
	"vmulps   (%7,%0,4), %%ymm15, %%ymm10 \n\t"
	"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps %%ymm6, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm5 , %%ymm5  \n\t"

	"vaddps %%ymm4, %%ymm0 , %%ymm0  \n\t"
	"vaddps %%ymm5, %%ymm1 , %%ymm1  \n\t"

	"vmovups %%ymm0,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm1, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -1,473 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*2;

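	// old kernel, non-FMA counterpart of the 64-row kernel removed above:
	// vmulps into ymm4..ymm7, vaddps into the ymm8..ymm15 accumulators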
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	"vxorps  %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
	"vxorps  %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
	"vxorps  %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
	"vxorps  %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch
	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"prefetcht0 64(%%r8)\n\t"  // Prefetch
	"vmulps  16*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  24*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp
	"prefetcht0 128(%%r8)\n\t"  // Prefetch
	"vaddps  %%ymm10, %%ymm6, %%ymm10\n\t"  // add to temp
	"vaddps  %%ymm11, %%ymm7, %%ymm11\n\t"  // add to temp

	"prefetcht0 192(%%r8)\n\t"  // Prefetch
	"vmulps  32*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps  40*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"vmulps  48*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  56*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm12, %%ymm4, %%ymm12\n\t"  // add to temp
	"vaddps  %%ymm13, %%ymm5, %%ymm13\n\t"  // add to temp
	"vaddps  %%ymm14, %%ymm6, %%ymm14\n\t"  // add to temp
	"vaddps  %%ymm15, %%ymm7, %%ymm15\n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha
	"vmulps  %%ymm12, %%ymm1, %%ymm12\n\t"  // scale by alpha
	"vmulps  %%ymm13, %%ymm1, %%ymm13\n\t"  // scale by alpha
	"vmulps  %%ymm14, %%ymm1, %%ymm14\n\t"  // scale by alpha
	"vmulps  %%ymm15, %%ymm1, %%ymm15\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
|
||||
{
|
||||
|
||||
|
||||
float *pre = a + lda*3;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movq %0, %%rax\n\t" // n -> rax
|
||||
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
|
||||
"movq %2, %%rsi\n\t" // adress of a -> rsi
|
||||
"movq %3, %%rcx\n\t" // value of lda > rcx
|
||||
"movq %4, %%rdi\n\t" // adress of x -> rdi
|
||||
"movq %5, %%rdx\n\t" // adress of y -> rdx
|
||||
"movq %6, %%r8\n\t" // address for prefetch
|
||||
"prefetcht0 (%%r8)\n\t" // Prefetch
|
||||
"prefetcht0 64(%%r8)\n\t" // Prefetch
|
||||
|
||||
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
|
||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
|
||||
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
|
||||
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
|
||||
"nop \n\t"
|
||||
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
|
||||
|
||||
"prefetcht0 (%%r8)\n\t" // Prefetch
|
||||
"prefetcht0 64(%%r8)\n\t" // Prefetch
|
||||
|
||||
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
|
||||
|
||||
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
|
||||
|
||||
|
||||
|
||||
"addq $4 , %%rdi \n\t" // increment pointer of c
|
||||
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
|
||||
|
||||
"dec %%rax \n\t" // n = n -1
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
|
||||
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
|
||||
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
|
||||
|
||||
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
|
||||
|
||||
:
|
||||
:
|
||||
"m" (n), // 0
|
||||
"m" (alpha), // 1
|
||||
"m" (a), // 2
|
||||
"m" (lda), // 3
|
||||
"m" (x), // 4
|
||||
"m" (y), // 5
|
||||
"m" (pre) // 6
|
||||
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	float *pre = a + lda*3;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx
	"movq %6, %%r8\n\t" // address for prefetch
	"prefetcht0 (%%r8)\n\t" // prefetch
	"prefetcht0 64(%%r8)\n\t" // prefetch

	"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%ymm0 \n\t" // load value of x
	"nop \n\t"
	"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t" // prefetch

	"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and x
	"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and x

	"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // add to temp
	"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
	"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha

	"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
	"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y), // 5
	"m" (pre) // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"memory"
	);

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%ymm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and x
	"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
	"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"memory"
	);

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%xmm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha

	"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vmovss (%%rdi), %%xmm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and x

	"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp
	"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
	"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha

	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
	"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4", "%xmm5",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vmovss (%%rdi), %%xmm0 \n\t" // load value of x
	"addq $4 , %%rdi \n\t" // increment pointer of x

	"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp

	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha

	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
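
These fixed-width kernels all share one contract: walk n columns of a (column stride lda), multiply each column block by the matching element of x, accumulate in registers, then scale by alpha and store the block of y. A plain-C sketch of that contract for sgemv_kernel_8, as I read the asm above (illustration only, not code from this commit):

/* Hypothetical reference version of sgemv_kernel_8: 8 rows of y,
   accumulated across n columns of a (column stride lda), then
   scaled by alpha and stored. Names mirror the asm operands. */
static void sgemv_kernel_8_ref(long n, float alpha, float *a, long lda,
                               float *x, float *y)
{
	float temp[8] = { 0.0f };
	long i, j;
	for (j = 0; j < n; j++) {                  /* one column per iteration */
		for (i = 0; i < 8; i++)
			temp[i] += a[j * lda + i] * x[j];  /* vmulps/vaddps pair */
	}
	for (i = 0; i < 8; i++)
		y[i] = temp[i] * alpha;                /* scale by alpha, store temp -> y */
}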

@@ -0,0 +1,624 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell-4.c"
#endif

#define NBMAX 4096

#ifndef HAVE_KERNEL_4x4

static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];
	FLOAT temp0 = 0.0;
	FLOAT temp1 = 0.0;
	FLOAT temp2 = 0.0;
	FLOAT temp3 = 0.0;

	for ( i=0; i< n; i+=4 )
	{
		temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
		temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
		temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
		temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
	}
	y[0] = temp0;
	y[1] = temp1;
	y[2] = temp2;
	y[3] = temp3;
}

#endif
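
The fallback above doubles as documentation of the 4x4 kernel contract: each ap[k] points at one column of A, and y receives the four dot products against x. A minimal, hypothetical harness (not part of the commit; the simple loop is equivalent to the unrolled one above):

#include <stdio.h>

typedef long BLASLONG;
typedef float FLOAT;

static void sgemv_kernel_4x4_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT t0 = 0.0f, t1 = 0.0f, t2 = 0.0f, t3 = 0.0f;
	for (i = 0; i < n; i++) {
		t0 += ap[0][i] * x[i];
		t1 += ap[1][i] * x[i];
		t2 += ap[2][i] * x[i];
		t3 += ap[3][i] * x[i];
	}
	y[0] = t0; y[1] = t1; y[2] = t2; y[3] = t3;
}

int main(void)
{
	FLOAT a0[8], a1[8], a2[8], a3[8], x[8], y[4];
	FLOAT *ap[4] = { a0, a1, a2, a3 };
	for (int i = 0; i < 8; i++) {
		a0[i] = 1.0f; a1[i] = 2.0f; a2[i] = 3.0f; a3[i] = 4.0f;
		x[i] = 1.0f;                   /* x = ones, so y[k] = column sums */
	}
	sgemv_kernel_4x4_ref(8, ap, x, y);
	printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); /* expect 8 16 24 32 */
	return 0;
}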

static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
	BLASLONG i;

	i=0;

	__asm__ __volatile__
	(
	"xorps %%xmm10 , %%xmm10 \n\t"
	"xorps %%xmm11 , %%xmm11 \n\t"

	"testq $4 , %1 \n\t"
	"jz .L01LABEL%= \n\t"

	"movups (%5,%0,4) , %%xmm14 \n\t" // x
	"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $4 , %1 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	".L01LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L01END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%5,%0,4) , %%xmm14 \n\t" // x
	"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	"movups 16(%5,%0,4) , %%xmm14 \n\t" // x
	"movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	".L01END%=: \n\t"

	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm11, %%xmm11 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm11, %%xmm11 \n\t"

	"movss %%xmm10, (%2) \n\t"
	"movss %%xmm11,4(%2) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (y), // 2
	"r" (ap0), // 3
	"r" (ap1), // 4
	"r" (x) // 5
	: "cc",
	"%xmm4", "%xmm5", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
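
The epilogue's haddps pair is the standard SSE3 horizontal reduction: each haddps halves the number of distinct partial sums, so two of them collapse the four lanes into lane 0, which movss then writes out. The same idiom written with intrinsics (illustration only, assumes SSE3 is available):

#include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */

/* Sketch of the reduction used above: v = [a, b, c, d] -> a+b+c+d. */
static inline float hsum_ps(__m128 v)
{
	v = _mm_hadd_ps(v, v);   /* [a+b, c+d, a+b, c+d] */
	v = _mm_hadd_ps(v, v);   /* [a+b+c+d, ...]       */
	return _mm_cvtss_f32(v);
}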

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;

	i=0;

	__asm__ __volatile__
	(
	"xorps %%xmm9 , %%xmm9 \n\t"
	"xorps %%xmm10 , %%xmm10 \n\t"

	"testq $4 , %1 \n\t"
	"jz .L01LABEL%= \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"mulps %%xmm11 , %%xmm12 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $4 , %1 \n\t"

	".L01LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L01END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups 16(%3,%0,4) , %%xmm14 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"movups 16(%4,%0,4) , %%xmm13 \n\t"
	"mulps %%xmm11 , %%xmm12 \n\t"
	"mulps %%xmm13 , %%xmm14 \n\t"
	"addq $8 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $8 , %1 \n\t"
	"addps %%xmm14 , %%xmm9 \n\t"

	"jnz .L01LOOP%= \n\t"

	".L01END%=: \n\t"

	"addps %%xmm9 , %%xmm10 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"

	"movss %%xmm10, (%2) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (y), // 2
	"r" (ap), // 3
	"r" (x) // 4
	: "cc",
	"%xmm9", "%xmm10" ,
	"%xmm11", "%xmm12", "%xmm13", "%xmm14",
	"memory"
	);

}

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG i;
	for ( i=0; i<n; i++ )
	{
		*dest = *src;
		dest++;
		src += inc_src;
	}
}

static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{

	BLASLONG i;

	if ( inc_dest != 1 )
	{
		for ( i=0; i<n; i++ )
		{
			*dest += src[i] * da;
			dest += inc_dest;
		}
		return;
	}

	i=0;

	__asm__ __volatile__
	(
	"movss (%2) , %%xmm10 \n\t"
	"shufps $0 , %%xmm10 , %%xmm10 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"mulps %%xmm10 , %%xmm12 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm11 \n\t"
	"subq $4 , %1 \n\t"
	"movups %%xmm11, -16(%4,%0,4) \n\t"

	"jnz .L01LOOP%= \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (&da), // 2
	"r" (src), // 3
	"r" (dest) // 4
	: "cc",
	"%xmm10", "%xmm11", "%xmm12",
	"memory"
	);

}
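
For reference, the SSE loop above is the vectorized form of dest[i] += src[i] * da: shufps broadcasts da across the register, and each iteration loads, multiplies, adds, and stores four floats. A scalar sketch of my reading (assumes inc_dest == 1 and n a multiple of 4, which is how the driver below calls it):

/* Hypothetical scalar equivalent of the asm path in add_y above. */
static void add_y_ref(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
	BLASLONG i;
	for (i = 0; i < n; i++)
		dest[i] += src[i] * da;   /* mulps by broadcast da, addps, store */
}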

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG register i;
	BLASLONG register j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n0;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	FLOAT ybuffer[4],*xbuffer;
	FLOAT *ytemp;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	xbuffer = buffer;
	ytemp = buffer + NBMAX;

	n0 = n / NBMAX;
	n1 = (n % NBMAX) >> 2 ;
	n2 = n & 3 ;

	m3 = m & 3 ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;

	BLASLONG NB = NBMAX;

	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		y_ptr = y;
		a_ptr = a;
		x_ptr = x;

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(NB,x_ptr,xbuffer,inc_x);

		FLOAT *ap[4];
		FLOAT *yp;
		BLASLONG register lda4 = 4 * lda;
		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		if ( n0 > 0 )
		{
			BLASLONG nb1 = NBMAX / 4;
			for( j=0; j<n0; j++)
			{

				yp = ytemp;
				for( i = 0; i < nb1 ; i++)
				{
					sgemv_kernel_4x4(NB,ap,xbuffer,yp);
					ap[0] += lda4 ;
					ap[1] += lda4 ;
					ap[2] += lda4 ;
					ap[3] += lda4 ;
					yp += 4;
				}
				add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
				y_ptr += nb1 * inc_y * 4;
				a_ptr += nb1 * lda4 ;

			}

		}

		yp = ytemp;

		for( i = 0; i < n1 ; i++)
		{
			sgemv_kernel_4x4(NB,ap,xbuffer,yp);
			ap[0] += lda4 ;
			ap[1] += lda4 ;
			ap[2] += lda4 ;
			ap[3] += lda4 ;
			yp += 4;
		}
		if ( n1 > 0 )
		{
			add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
			y_ptr += n1 * inc_y * 4;
			a_ptr += n1 * lda4 ;
		}

		if ( n2 & 2 )
		{

			sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
			a_ptr += lda * 2;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr += inc_y;
			*y_ptr += ybuffer[1] * alpha;
			y_ptr += inc_y;

		}

		if ( n2 & 1 )
		{

			sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
			a_ptr += lda;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr += inc_y;

		}
		a += NB;
		x += NB * inc_x;
	}

	if ( m3 == 0 ) return(0);

	x_ptr = x;
	a_ptr = a;
	if ( m3 == 3 )
	{
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp2 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		if ( lda == 3 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{

				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
				y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
				y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
				y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
				aj += 12;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
				aj += 3;
			}

		}
		else
		{

			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
					y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
					aj += lda;
				}

			}
			else
			{

				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
					y_ptr += inc_y;
					aj += lda;
				}

			}

		}
		return(0);
	}

	if ( m3 == 2 )
	{
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		if ( lda == 2 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{
				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
				y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
				y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
				aj += 8;

			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				aj += 2;
			}

		}
		else
		{
			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					aj += lda;
				}

			}
			else
			{
				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					y_ptr += inc_y;
					aj += lda;
				}
			}

		}
		return(0);

	}

	FLOAT xtemp = *x_ptr * alpha;
	FLOAT *aj = a_ptr;
	y_ptr = y;
	if ( lda == 1 && inc_y == 1 )
	{
		for ( j=0; j< ( n & -4) ; j+=4 )
		{
			y_ptr[j]   += aj[j] * xtemp;
			y_ptr[j+1] += aj[j+1] * xtemp;
			y_ptr[j+2] += aj[j+2] * xtemp;
			y_ptr[j+3] += aj[j+3] * xtemp;
		}
		for ( ; j<n ; j++ )
		{
			y_ptr[j] += aj[j] * xtemp;
		}

	}
	else
	{
		if ( inc_y == 1 )
		{

			BLASLONG register lda2 = lda << 1;
			BLASLONG register lda4 = lda << 2;
			BLASLONG register lda3 = lda2 + lda;
			for ( j=0; j< ( n & -4 ); j+=4 )
			{
				y_ptr[j]   += *aj * xtemp;
				y_ptr[j+1] += *(aj+lda) * xtemp;
				y_ptr[j+2] += *(aj+lda2) * xtemp;
				y_ptr[j+3] += *(aj+lda3) * xtemp;
				aj += lda4 ;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += *aj * xtemp;
				aj += lda;
			}

		}
		else
		{
			for ( j=0; j<n; j++ )
			{
				*y_ptr += *aj * xtemp;
				y_ptr += inc_y;
				aj += lda;
			}

		}
	}

	return(0);
}
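
The blocking arithmetic at the top of CNAME is easiest to see with numbers plugged in: m is processed in panels of at most NBMAX rows, and n is split into full-NBMAX column blocks (n0), remaining groups of four columns (n1), and a 0..3 column tail (n2). A hypothetical example (sizes invented; the expressions are copied from above):

#include <stdio.h>
#define NBMAX 4096

int main(void)
{
	long m = 10000, n = 103;
	long n0 = n / NBMAX;              /* 0:     full NBMAX column blocks     */
	long n1 = (n % NBMAX) >> 2;       /* 25:    remaining 4-column groups    */
	long n2 = n & 3;                  /* 3:     leftover columns (4x2 + 4x1) */
	long m3 = m & 3;                  /* 0:     leftover rows, scalar tail   */
	long m1 = m & -4;                 /* 10000: rows rounded down to 4       */
	long m2 = (m & (NBMAX - 1)) - m3; /* 1808:  last partial row panel       */
	printf("n0=%ld n1=%ld n2=%ld m1=%ld m2=%ld m3=%ld\n",
	       n0, n1, n2, m1, m2, m3);
	return 0;
}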

@@ -1,232 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell.c"
#else
#include "sgemv_t_microk_sandy.c"
#endif

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG i;
	for ( i=0; i<n; i++ )
	{
		*dest = *src;
		dest++;
		src += inc_src;
	}
}

static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
{

	FLOAT register temp0 = 0.0;
	BLASLONG i;
	for ( i=0; i<n ; i++)
	{
		temp0 += a[i] * x[i];
	}
	temp0 *= alpha ;
	*y += temp0;
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *a_ptrl;
	BLASLONG m1;
	BLASLONG register m2;
	FLOAT *xbuffer;
	xbuffer = buffer;
	BLASLONG register Mblock;

	m1 = m / 1024 ;
	m2 = m % 1024 ;

	x_ptr = x;
	a_ptr = a;

	for (j=0; j<m1; j++)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(1024,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += 1024;
		x_ptr += 1024 * inc_x;
	}

	if ( m2 == 0 ) return(0);

	Mblock = 512;
	while ( Mblock >= 16 )
	{
		if ( m2 & Mblock)
		{

			if ( inc_x == 1 )
				xbuffer = x_ptr;
			else
				copy_x(Mblock,x_ptr,xbuffer,inc_x);

			y_ptr = y;
			a_ptrl = a_ptr;

			for(i = 0; i<n; i++ )
			{
				sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
				y_ptr += inc_y;
				a_ptrl += lda;
			}
			a_ptr += Mblock;
			x_ptr += Mblock * inc_x;

		}
		Mblock /= 2;

	}

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		xbuffer = x_ptr;

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}

	}

	return(0);
}
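
The deleted driver handled the sub-1024 remainder m2 by walking its bits from 512 down to 1, one block per set bit, so e.g. m2 = 1000 decomposes as 512+256+128+64+32+8. A compact sketch of that decomposition (illustration only):

#include <stdio.h>

int main(void)
{
	long m2 = 1000;  /* hypothetical remainder */
	for (long Mblock = 512; Mblock >= 1; Mblock /= 2)
		if (m2 & Mblock)
			printf("block of %ld rows\n", Mblock);
	return 0;        /* prints 512, 256, 128, 64, 32, 8 */
}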

@@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"testq $0x04, %1 \n\t"
	"jz .L08LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"

	".L08LABEL%=: \n\t"

	"testq $0x08, %1 \n\t"
	"jz .L16LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"

	".L16LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L16END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

	"prefetcht0 384(%4,%0,4) \n\t"
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"prefetcht0 384(%5,%0,4) \n\t"
	".align 2 \n\t"
	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
	"prefetcht0 384(%6,%0,4) \n\t"
	".align 2 \n\t"
	"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t"
	"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t"
	"prefetcht0 384(%7,%0,4) \n\t"
	"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t"
	"addq $16, %0 \n\t"
	"vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t"
	"vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t"
	"subq $16, %1 \n\t"
	"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"

	"jnz .L01LOOP%= \n\t"

	".L16END%=: \n\t"
	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vmovss %%xmm4, (%3) \n\t"
	"vmovss %%xmm5, 4(%3) \n\t"
	"vmovss %%xmm6, 8(%3) \n\t"
	"vmovss %%xmm7, 12(%3) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5",
	"%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
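
vfmaddps here is AMD's 4-operand FMA4 encoding, which is why this variant is selected only for BULLDOZER/PILEDRIVER: each instruction computes acc = a * x + acc in one fused step instead of a separate multiply and add. The same per-step operation written with the FMA4 intrinsic, as an illustration (assumes a compiler with -mfma4 support):

#include <x86intrin.h>  /* FMA4 intrinsics; compile with -mfma4 */

/* One 4-lane fused multiply-add, matching each vfmaddps above. */
static inline __m128 fma4_step(__m128 acc, __m128 a, __m128 x)
{
	return _mm_macc_ps(a, x, acc);  /* a * x + acc */
}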

@@ -1,99 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
	"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
	"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
	"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero

	"sarq $4, %%rax \n\t" // n = n / 16

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	// "prefetcht0 512(%%rsi) \n\t"
	"prefetcht0 (%%r8) \n\t" // prefetch next line of a
	"vmovups (%%rsi), %%xmm4 \n\t"
	"vmovups 4*4(%%rsi), %%xmm5 \n\t"
	"vmovups 8*4(%%rsi), %%xmm6 \n\t"
	"vmovups 12*4(%%rsi), %%xmm7 \n\t"

	"vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and x and add to temp

	"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
	"addq $16*4 , %%rsi \n\t" // increment pointer of a
	"addq $16*4 , %%rdi \n\t" // increment pointer of x
	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
	"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"

	"vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t"
	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

@@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper \n\t"
	"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
	"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

	"testq $0x04, %1 \n\t"
	"jz .L08LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t"

	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"

	".L08LABEL%=: \n\t"

	"testq $0x08, %1 \n\t"
	"jz .L16LABEL%= \n\t"

	"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"

	".L16LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L16END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 384(%2,%0,4) \n\t"
	"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
	"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x

	"prefetcht0 384(%4,%0,4) \n\t"
	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
	"prefetcht0 384(%5,%0,4) \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"prefetcht0 384(%6,%0,4) \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"
	"prefetcht0 384(%7,%0,4) \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t"

	"addq $16, %0 \n\t"
	"subq $16, %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	".L16END%=: \n\t"

	"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
	"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
	"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
	"vextractf128 $1 , %%ymm7, %%xmm15 \n\t"

	"vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
	"vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
	"vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
	"vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vmovss %%xmm4, (%3) \n\t"
	"vmovss %%xmm5, 4(%3) \n\t"
	"vmovss %%xmm6, 8(%3) \n\t"
	"vmovss %%xmm7, 12(%3) \n\t"

	"vzeroupper \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
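
The Haswell variant is the same dataflow with FMA3 (vfmadd231ps) and 8-wide ymm registers, bracketed by vzeroupper to avoid AVX/SSE transition stalls. Its per-element operation, written with the corresponding intrinsic (illustration only, assumes -mfma):

#include <immintrin.h>  /* AVX2/FMA3; compile with -mfma */

/* One 8-lane fused multiply-add, matching each vfmadd231ps above. */
static inline __m256 fma3_step(__m256 acc, __m256 a, __m256 x)
{
	return _mm256_fmadd_ps(a, x, acc);  /* a * x + acc */
}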

@@ -1,100 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
	"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
	"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
	"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero

	"sarq $4, %%rax \n\t" // n = n / 16

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	// "prefetcht0 512(%%rsi) \n\t"
	"prefetcht0 (%%r8) \n\t" // prefetch next line of a
	"vmovups (%%rsi), %%xmm4 \n\t"
	"vmovups 4*4(%%rsi), %%xmm5 \n\t"
	"vmovups 8*4(%%rsi), %%xmm6 \n\t"
	"vmovups 12*4(%%rsi), %%xmm7 \n\t"

	"vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and x and add to temp
	"vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and x and add to temp
	"vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and x and add to temp
	"vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and x and add to temp

	"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
	"addq $16*4 , %%rsi \n\t" // increment pointer of a
	"addq $16*4 , %%rdi \n\t" // increment pointer of x
	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
	"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t"
	"vaddss (%%rdx), %%xmm12,%%xmm12\n\t"
	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

@@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"xorps %%xmm4 , %%xmm4 \n\t"
	"xorps %%xmm5 , %%xmm5 \n\t"
	"xorps %%xmm6 , %%xmm6 \n\t"
	"xorps %%xmm7 , %%xmm7 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
	"movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1
	"movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2
	"movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3

	"mulps %%xmm12, %%xmm8 \n\t"
	"mulps %%xmm12, %%xmm9 \n\t"
	"mulps %%xmm12, %%xmm10 \n\t"
	"mulps %%xmm12, %%xmm11 \n\t"
	"addps %%xmm8 , %%xmm4 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm9 , %%xmm5 \n\t"
	"subq $4 , %1 \n\t"
	"addps %%xmm10, %%xmm6 \n\t"
	"addps %%xmm11, %%xmm7 \n\t"

	"jnz .L01LOOP%= \n\t"

	"haddps %%xmm4, %%xmm4 \n\t"
	"haddps %%xmm5, %%xmm5 \n\t"
	"haddps %%xmm6, %%xmm6 \n\t"
	"haddps %%xmm7, %%xmm7 \n\t"

	"haddps %%xmm4, %%xmm4 \n\t"
	"haddps %%xmm5, %%xmm5 \n\t"
	"haddps %%xmm6, %%xmm6 \n\t"
	"haddps %%xmm7, %%xmm7 \n\t"

	"movss %%xmm4, (%3) \n\t"
	"movss %%xmm5, 4(%3) \n\t"
	"movss %%xmm6, 8(%3) \n\t"
	"movss %%xmm7, 12(%3) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12",
	"memory"
	);

}

@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	register BLASLONG i = 0;

	__asm__ __volatile__
	(
	"vzeroupper				 \n\t"
	"vxorps	%%ymm0 , %%ymm0, %%ymm0	 \n\t"	// clear the eight accumulators
	"vxorps	%%ymm1 , %%ymm1, %%ymm1	 \n\t"
	"vxorps	%%ymm2 , %%ymm2, %%ymm2	 \n\t"
	"vxorps	%%ymm3 , %%ymm3, %%ymm3	 \n\t"
	"vxorps	%%ymm4 , %%ymm4, %%ymm4	 \n\t"
	"vxorps	%%ymm5 , %%ymm5, %%ymm5	 \n\t"
	"vxorps	%%ymm6 , %%ymm6, %%ymm6	 \n\t"
	"vxorps	%%ymm7 , %%ymm7, %%ymm7	 \n\t"

	"testq	$0x04, %1		 \n\t"	// n & 4 ?
	"jz	.L08LABEL%=		 \n\t"

	"vmovups	(%2,%0,4), %%xmm12	 \n\t"	// 4 * x

	"vmulps	(%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps	(%5,%0,4), %%xmm12, %%xmm10 \n\t"
	"vmulps	(%6,%0,4), %%xmm12, %%xmm9  \n\t"
	"vmulps	(%7,%0,4), %%xmm12, %%xmm11 \n\t"
	"vaddps	%%xmm4, %%xmm8 , %%xmm4	 \n\t"
	"addq	$4 , %0			 \n\t"
	"vaddps	%%xmm5, %%xmm10, %%xmm5	 \n\t"
	"vaddps	%%xmm6, %%xmm9 , %%xmm6	 \n\t"
	"subq	$4 , %1			 \n\t"
	"vaddps	%%xmm7, %%xmm11, %%xmm7	 \n\t"

	".L08LABEL%=:			 \n\t"

	"testq	$0x08, %1		 \n\t"	// n & 8 ?
	"jz	.L16LABEL%=		 \n\t"

	"vmovups	(%2,%0,4), %%ymm12	 \n\t"	// 8 * x

	"vmulps	(%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	(%5,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	(%6,%0,4), %%ymm12, %%ymm9  \n\t"
	"vmulps	(%7,%0,4), %%ymm12, %%ymm11 \n\t"
	"vaddps	%%ymm4, %%ymm8 , %%ymm4	 \n\t"
	"addq	$8 , %0			 \n\t"
	"vaddps	%%ymm5, %%ymm10, %%ymm5	 \n\t"
	"vaddps	%%ymm6, %%ymm9 , %%ymm6	 \n\t"
	"subq	$8 , %1			 \n\t"
	"vaddps	%%ymm7, %%ymm11, %%ymm7	 \n\t"

	".L16LABEL%=:			 \n\t"

	"cmpq	$0, %1			 \n\t"
	"je	.L16END%=		 \n\t"

	".align 16			 \n\t"
	".L01LOOP%=:			 \n\t"	// main loop: 16 elements per iteration
	"prefetcht0	384(%2,%0,4)	 \n\t"
	"vmovups	(%2,%0,4), %%ymm12	 \n\t"	// 8 * x
	"vmovups	32(%2,%0,4), %%ymm13	 \n\t"	// 8 * x

	"prefetcht0	384(%4,%0,4)	 \n\t"
	"vmulps	(%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	32(%4,%0,4), %%ymm13, %%ymm9  \n\t"
	"prefetcht0	384(%5,%0,4)	 \n\t"
	"vmulps	(%5,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps	%%ymm4, %%ymm8 , %%ymm4	 \n\t"
	"vaddps	%%ymm0, %%ymm9 , %%ymm0	 \n\t"
	"vaddps	%%ymm1, %%ymm10, %%ymm1	 \n\t"
	"vaddps	%%ymm5, %%ymm11, %%ymm5	 \n\t"
	"prefetcht0	384(%6,%0,4)	 \n\t"
	"vmulps	(%6,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	32(%6,%0,4), %%ymm13, %%ymm9  \n\t"
	"prefetcht0	384(%7,%0,4)	 \n\t"
	"vmulps	(%7,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	32(%7,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps	%%ymm6, %%ymm8 , %%ymm6	 \n\t"
	"addq	$16, %0			 \n\t"
	"vaddps	%%ymm2, %%ymm9 , %%ymm2	 \n\t"
	"vaddps	%%ymm7, %%ymm10, %%ymm7	 \n\t"
	"subq	$16, %1			 \n\t"
	"vaddps	%%ymm3, %%ymm11, %%ymm3	 \n\t"

	"jnz	.L01LOOP%=		 \n\t"

	".L16END%=:			 \n\t"

	"vaddps	%%ymm4, %%ymm0, %%ymm4	 \n\t"	// fold the eight accumulators into four
	"vaddps	%%ymm5, %%ymm1, %%ymm5	 \n\t"
	"vaddps	%%ymm6, %%ymm2, %%ymm6	 \n\t"
	"vaddps	%%ymm7, %%ymm3, %%ymm7	 \n\t"

	"vextractf128	$1 , %%ymm4, %%xmm12	 \n\t"	// horizontal reduction of each accumulator
	"vextractf128	$1 , %%ymm5, %%xmm13	 \n\t"
	"vextractf128	$1 , %%ymm6, %%xmm14	 \n\t"
	"vextractf128	$1 , %%ymm7, %%xmm15	 \n\t"

	"vaddps	%%xmm4, %%xmm12, %%xmm4	 \n\t"
	"vaddps	%%xmm5, %%xmm13, %%xmm5	 \n\t"
	"vaddps	%%xmm6, %%xmm14, %%xmm6	 \n\t"
	"vaddps	%%xmm7, %%xmm15, %%xmm7	 \n\t"

	"vhaddps	%%xmm4, %%xmm4, %%xmm4	 \n\t"
	"vhaddps	%%xmm5, %%xmm5, %%xmm5	 \n\t"
	"vhaddps	%%xmm6, %%xmm6, %%xmm6	 \n\t"
	"vhaddps	%%xmm7, %%xmm7, %%xmm7	 \n\t"

	"vhaddps	%%xmm4, %%xmm4, %%xmm4	 \n\t"
	"vhaddps	%%xmm5, %%xmm5, %%xmm5	 \n\t"
	"vhaddps	%%xmm6, %%xmm6, %%xmm6	 \n\t"
	"vhaddps	%%xmm7, %%xmm7, %%xmm7	 \n\t"

	"vmovss	%%xmm4, (%3)		 \n\t"	// store the four dot products to y
	"vmovss	%%xmm5, 4(%3)		 \n\t"
	"vmovss	%%xmm6, 8(%3)		 \n\t"
	"vmovss	%%xmm7, 12(%3)		 \n\t"

	"vzeroupper			 \n\t"

	:
	  "+r" (i),	// 0  updated by addq inside the asm, so declared read-write
	  "+r" (n)	// 1  counted down to zero inside the asm, so declared read-write
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3])	// 7
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
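
For orientation, a plain-C sketch of what the AVX kernel above computes: four simultaneous dot products of the columns ap[0]..ap[3] against x, stored to y[0]..y[3]. The _ref name and the function itself are ours for illustration only (it assumes n is a multiple of 4, which the kernel's remainder handling guarantees); the tuned asm above is the actual implementation.

// Reference-only sketch (not part of the original file): scalar
// equivalent of sgemv_kernel_4x4 above. Assumes n % 4 == 0.
static void sgemv_kernel_4x4_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT temp0 = 0.0;
	FLOAT temp1 = 0.0;
	FLOAT temp2 = 0.0;
	FLOAT temp3 = 0.0;

	for ( i = 0; i < n; i++ )
	{
		temp0 += ap[0][i] * x[i];	// column 0 dot x
		temp1 += ap[1][i] * x[i];	// column 1 dot x
		temp2 += ap[2][i] * x[i];	// column 2 dot x
		temp3 += ap[3][i] * x[i];	// column 3 dot x
	}

	y[0] = temp0;
	y[1] = temp1;
	y[2] = temp2;
	y[3] = temp3;
}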

@ -1,106 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the OpenBLAS project nor the names of
      its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq	%0, %%rax\n\t"			// n -> rax
	"vmovss	%1, %%xmm1\n\t"			// alpha -> xmm1
	"movq	%2, %%rsi\n\t"			// address of a -> rsi
	"movq	%3, %%rcx\n\t"			// value of lda -> rcx
	"movq	%4, %%rdi\n\t"			// address of x -> rdi
	"movq	%5, %%rdx\n\t"			// address of y -> rdx

	"leaq	(, %%rcx,4), %%rcx \n\t"	// scale lda by size of float
	"leaq	(%%rsi,%%rcx,1), %%r8 \n\t"	// pointer to next line

	"vxorps	%%xmm12, %%xmm12, %%xmm12\n\t"	// set accumulators to zero
	"vxorps	%%xmm13, %%xmm13, %%xmm13\n\t"
	"vxorps	%%xmm14, %%xmm14, %%xmm14\n\t"
	"vxorps	%%xmm15, %%xmm15, %%xmm15\n\t"

	"sarq	$4, %%rax \n\t"			// n = n / 16

	".align 16		 \n\t"
	".L01LOOP%=:		 \n\t"
	// "prefetcht0	512(%%rsi) \n\t"
	"prefetcht0	(%%r8)	 \n\t"		// prefetch next line of a
	"vmovups	(%%rsi), %%xmm4 \n\t"
	"vmovups	4*4(%%rsi), %%xmm5 \n\t"
	"vmovups	8*4(%%rsi), %%xmm6 \n\t"
	"vmovups	12*4(%%rsi), %%xmm7 \n\t"

	"vmulps	0*4(%%rdi), %%xmm4, %%xmm8 \n\t"	// multiply a and x
	"vmulps	4*4(%%rdi), %%xmm5, %%xmm9 \n\t"
	"vmulps	8*4(%%rdi), %%xmm6, %%xmm10\n\t"
	"vmulps	12*4(%%rdi), %%xmm7, %%xmm11\n\t"

	"vaddps	%%xmm12, %%xmm8 , %%xmm12\n\t"		// add the products to the accumulators
	"vaddps	%%xmm13, %%xmm9 , %%xmm13\n\t"
	"vaddps	%%xmm14, %%xmm10, %%xmm14\n\t"
	"vaddps	%%xmm15, %%xmm11, %%xmm15\n\t"

	"addq	$16*4 , %%r8	 \n\t"		// increment prefetch pointer
	"addq	$16*4 , %%rsi	 \n\t"		// increment pointer of a
	"addq	$16*4 , %%rdi	 \n\t"		// increment pointer of x
	"dec	%%rax		 \n\t"		// n = n - 1
	"jnz	.L01LOOP%=	 \n\t"

	"vaddps	%%xmm12, %%xmm14, %%xmm12\n\t"	// reduce the four accumulators
	"vaddps	%%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps	%%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps	%%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps	%%xmm12, %%xmm12, %%xmm12\n\t"

	"vmulss	%%xmm12, %%xmm1, %%xmm12 \n\t"	// scale by alpha
	"vaddss	(%%rdx), %%xmm12, %%xmm12\n\t"
	"vmovss	%%xmm12, (%%rdx)	 \n\t"	// store temp -> y

	:
	:
	  "m" (n),	// 0
	  "m" (alpha),	// 1
	  "m" (a),	// 2
	  "m" (lda),	// 3
	  "m" (x),	// 4
	  "m" (y)	// 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
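
Since this file is deleted by the commit, for orientation only: a plain-C sketch of what the removed kernel computed, namely *y += alpha * dot(a, x) over floor(n/16)*16 elements. The _ref name is ours; lda is unused in the sketch because the asm needed it only to prefetch the next row of a.

// Reference-only sketch (not part of the original file): scalar
// equivalent of the deleted sgemv_kernel_16 above.
static void sgemv_kernel_16_ref( long n, float alpha, float *a, long lda, float *x, float *y)
{
	long i;
	float temp = 0.0f;

	(void) lda;				// only drove the prefetch in the asm version

	for ( i = 0; i < (n / 16) * 16; i++ )	// the asm runs n/16 blocks of 16 floats
		temp += a[i] * x[i];

	*y += alpha * temp;
}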

@ -1,6 +1,6 @@
 Data file for testing DSGESV/DSPOSV LAPACK routines
 12                                       Number of values of M
-0 1 2 13 17 45 78 91 101 119 120 132     values of M (row dimension)
+0 1 2 13 17 45 78 91 101 119 112 132     values of M (row dimension)
 6                                        Number of values of NRHS
 1 2 14 15 16 13                          Values of NRHS (number of right hand sides)
 30.0                                     Threshold value of test ratio