From 107c3ea7d56f933c6fdf350cab96f5f17086a73f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 12:35:20 +0200 Subject: [PATCH] added optimized zgemv_t routine --- kernel/x86_64/zgemv_t_4.c | 492 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 kernel/x86_64/zgemv_t_4.c diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c new file mode 100644 index 000000000..3ffcf64b4 --- /dev/null +++ b/kernel/x86_64/zgemv_t_4.c @@ -0,0 +1,492 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 1028 + +#ifndef HAVE_KERNEL_4x4 +
/*
 * zgemv_kernel_4x4: four complex dot products against the same vector x,
 * each scaled by alpha and accumulated into y.
 *
 * n      number of complex elements read from x and from each ap[k]
 *        (the loop visits FLOAT indices 0, 2, ..., 2*n-2).
 * ap     ap[0]..ap[3] point to four arrays of interleaved (real, imag)
 *        FLOAT pairs.
 * x      vector of interleaved (real, imag) FLOAT pairs.
 * y      four interleaved complex results in y[0..7]; they are updated in
 *        place (added to, not overwritten).
 * alpha  alpha[0] is the real part and alpha[1] the imaginary part of the
 *        scale factor.
 *
 * When CONJ and XCONJ are both defined or both undefined the loop
 * accumulates sum(a*x); otherwise it accumulates sum(conj(a)*x).
 * Without XCONJ the final update is y += alpha*sum; with XCONJ it is
 * y += alpha*conj(sum).
 *
 * Compiled only when HAVE_KERNEL_4x4 is not defined.
 *
 * NOTE(review): presumably ap[k] are four columns of the matrix for the
 * transposed product named in the commit subject (zgemv_t) - confirm
 * against the caller, which is outside this view.
 */
+static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + +
/* One complex element per iteration: index i is the real part, i+1 the imaginary part. The first preprocessor branch is the plain complex product, the second multiplies by the conjugate of the a element. */
+ for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += 
a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } +
/* Scale each accumulated sum by alpha and add it to y. In the XCONJ branch the sum is conjugated first, which is why the imaginary parts are updated with -= there. */
+#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif +} + +#endif + +#ifndef HAVE_KERNEL_4x1 +
/*
 * zgemv_kernel_4x1: single-array form of the kernel above.
 *
 * n      number of complex elements read from ap and from x.
 * ap     one array of interleaved (real, imag) FLOAT pairs.
 * x      vector of interleaved (real, imag) FLOAT pairs.
 * y      one complex result in y[0..1], updated in place.
 * alpha  alpha[0] real part, alpha[1] imaginary part of the scale factor.
 *
 * The CONJ / XCONJ handling is the same as in zgemv_kernel_4x4: the loop
 * accumulates sum(a*x) when both or neither macro is defined and
 * sum(conj(a)*x) otherwise; the final update is y += alpha*sum without
 * XCONJ and y += alpha*conj(sum) with XCONJ.
 *
 * Compiled only when HAVE_KERNEL_4x1 is not defined.
 */
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +#endif + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i