added optimized sger kernel for sandybridge
This commit is contained in:
parent
e216f686cb
commit
b2e1797dc6
|
@ -1,3 +1,5 @@
|
|||
SGERKERNEL = sger.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(SANDYBRIDGE)
|
||||
#include "sger_microk_sandy-2.c"
|
||||
#endif
|
||||
|
||||
/*
 * Column-by-column update driver for the single-precision GER kernel.
 *
 * For each of the n columns of a (consecutive columns are lda elements
 * apart) the scalar alpha * y[j] is formed and handed, together with the
 * unit-stride x vector and the column pointer, to the update routines:
 *   - the leading (m & -16) rows go to sger_kernel_16 when
 *     HAVE_KERNEL_16 is defined, otherwise to AXPYU_K;
 *   - the remaining (m % 16) rows always go to AXPYU_K.
 * AXPYU_K is defined outside this file; presumably it performs
 * a[i] += scale * x[i] -- confirm against its definition.
 *
 * m, n      number of rows / columns to process
 * dummy1    unused
 * alpha     scalar multiplier applied to every y element
 * x, incx   input vector and its stride; when incx != 1 the vector is
 *           first copied into buffer with unit stride via COPY_K
 * y, incy   input vector and its stride, one element consumed per column
 * a, lda    matrix storage and the distance between consecutive columns
 * buffer    scratch space, used only when incx != 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
	  FLOAT *x, BLASLONG incx,
	  FLOAT *y, BLASLONG incy,
	  FLOAT *a, BLASLONG lda, FLOAT *buffer){

	FLOAT *xvec = x;
	BLASLONG head;	/* rows covered by the 16-wide path */
	BLASLONG tail;	/* leftover rows, 0..15 */
	BLASLONG col;

	/* The update routines are called with unit stride, so pack x first. */
	if (incx != 1) {
		COPY_K(m, x, incx, buffer, 1);
		xvec = buffer;
	}

	head = m & -16;
	tail = m - head;

	for (col = 0; col < n; col++) {
		/* Per-column scale factor: alpha times the current y element. */
		FLOAT scale = alpha * *y;

		if (head > 0) {
#ifdef HAVE_KERNEL_16
			sger_kernel_16(head, xvec, a, &scale);
#else
			AXPYU_K(head, 0, 0, scale, xvec, 1, a, 1, NULL, 0);
#endif
		}

		if (tail > 0) {
			AXPYU_K(tail, 0, 0, scale, xvec + head, 1, a + head, 1, NULL, 0);
		}

		/* Advance to the next column and the next y element. */
		a += lda;
		y += incy;
	}

	return 0;
}
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vbroadcastss (%4), %%xmm0 \n\t" // alpha
|
||||
"prefetcht0 256(%3,%0,4) \n\t"
|
||||
"vmovups (%3,%0,4), %%xmm8 \n\t"
|
||||
"vmovups 16(%3,%0,4), %%xmm9 \n\t"
|
||||
"vmovups 32(%3,%0,4), %%xmm10 \n\t"
|
||||
"vmovups 48(%3,%0,4), %%xmm11 \n\t"
|
||||
|
||||
"prefetcht0 256(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm4 \n\t"
|
||||
"vmovups 16(%2,%0,4), %%xmm5 \n\t"
|
||||
"vmovups 32(%2,%0,4), %%xmm6 \n\t"
|
||||
"vmovups 48(%2,%0,4), %%xmm7 \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmulps %%xmm4, %%xmm0, %%xmm4 \n\t"
|
||||
"vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t"
|
||||
"vmulps %%xmm5, %%xmm0, %%xmm5 \n\t"
|
||||
"vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t"
|
||||
"vmulps %%xmm6, %%xmm0, %%xmm6 \n\t"
|
||||
"vaddps %%xmm10, %%xmm6, %%xmm14 \n\t"
|
||||
"vmulps %%xmm7, %%xmm0, %%xmm7 \n\t"
|
||||
"vaddps %%xmm11, %%xmm7, %%xmm15 \n\t"
|
||||
|
||||
"prefetcht0 256(%3,%0,4) \n\t"
|
||||
"vmovups (%3,%0,4), %%xmm8 \n\t"
|
||||
"vmovups 16(%3,%0,4), %%xmm9 \n\t"
|
||||
"vmovups 32(%3,%0,4), %%xmm10 \n\t"
|
||||
"vmovups 48(%3,%0,4), %%xmm11 \n\t"
|
||||
|
||||
"prefetcht0 256(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm4 \n\t"
|
||||
"vmovups 16(%2,%0,4), %%xmm5 \n\t"
|
||||
"vmovups 32(%2,%0,4), %%xmm6 \n\t"
|
||||
"vmovups 48(%2,%0,4), %%xmm7 \n\t"
|
||||
|
||||
"vmovups %%xmm12, -64(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm13, -48(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm14, -32(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm15, -16(%3,%0,4) \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"vmulps %%xmm4, %%xmm0, %%xmm4 \n\t"
|
||||
"vmulps %%xmm5, %%xmm0, %%xmm5 \n\t"
|
||||
"vmulps %%xmm6, %%xmm0, %%xmm6 \n\t"
|
||||
"vmulps %%xmm7, %%xmm0, %%xmm7 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t"
|
||||
"vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t"
|
||||
"vaddps %%xmm10, %%xmm6, %%xmm14 \n\t"
|
||||
"vaddps %%xmm11, %%xmm7, %%xmm15 \n\t"
|
||||
|
||||
"vmovups %%xmm12, -64(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm13, -48(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm14, -32(%3,%0,4) \n\t"
|
||||
"vmovups %%xmm15, -16(%3,%0,4) \n\t"
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue